From 061e368fe213bd0701261a3e59f796c7439484fc Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Mon, 29 May 2023 10:32:28 +0100 Subject: [PATCH 001/704] [SelectionDAG] Implement soft FP legalisation for bf16 FP_EXTEND and BF16_TO_FP As discussed in D151436, it's safe to do this as a simple shift (as is done in LegalizeDAG.cpp) rather than needing a libcall. The added test cases for RISC-V previously just triggered an assertion. Codegen for bfloat_to_double will be slightly improved by D151434. Differential Revision: https://reviews.llvm.org/D151563 --- .../SelectionDAG/LegalizeFloatTypes.cpp | 29 ++++- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + llvm/test/CodeGen/RISCV/bfloat.ll | 116 ++++++++++++++++++ 3 files changed, 142 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/bfloat.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index f1e80ce7e037d..29a1951bf9a3a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -107,6 +107,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break; case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break; + case ISD::BF16_TO_FP: R = SoftenFloatRes_BF16_TO_FP(N); break; case ISD::STRICT_FPOW: case ISD::FPOW: R = SoftenFloatRes_FPOW(N); break; case ISD::STRICT_FPOWI: @@ -510,10 +511,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { return BitConvertToInteger(Op); } - // There's only a libcall for f16 -> f32, so proceed in two stages. Also, it's - // entirely possible for both f16 and f32 to be legal, so use the fully - // hard-float FP_EXTEND rather than FP16_TO_FP. - if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) { + // There's only a libcall for f16 -> f32 and shifting is only valid for bf16 + // -> f32, so proceed in two stages. Also, it's entirely possible for both + // f16 and f32 to be legal, so use the fully hard-float FP_EXTEND rather + // than FP16_TO_FP. + if ((Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16) && + N->getValueType(0) != MVT::f32) { if (IsStrict) { Op = DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N), { MVT::f32, MVT::Other }, { Chain, Op }); @@ -523,6 +526,9 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { } } + if (Op.getValueType() == MVT::bf16) + return SoftenFloatRes_BF16_TO_FP(N); + RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); TargetLowering::MakeLibCallOptions CallOptions; @@ -555,6 +561,21 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) { return TLI.makeLibCall(DAG, LC, NVT, Res32, CallOptions, SDLoc(N)).first; } +// FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special +// nodes? 
+SDValue DAGTypeLegalizer::SoftenFloatRes_BF16_TO_FP(SDNode *N) { + assert(N->getValueType(0) == MVT::f32 && + "Can only soften BF16_TO_FP with f32 result"); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32); + SDValue Op = N->getOperand(0); + SDLoc DL(N); + Op = DAG.getNode(ISD::ANY_EXTEND, DL, NVT, + DAG.getNode(ISD::BITCAST, DL, MVT::i16, Op)); + SDValue Res = DAG.getNode(ISD::SHL, DL, NVT, Op, + DAG.getShiftAmountConstant(16, NVT, DL)); + return Res; +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) { bool IsStrict = N->isStrictFPOpcode(); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 09d47caeef471..e73b6b1a826cf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -560,6 +560,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_FNEG(SDNode *N); SDValue SoftenFloatRes_FP_EXTEND(SDNode *N); SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N); + SDValue SoftenFloatRes_BF16_TO_FP(SDNode *N); SDValue SoftenFloatRes_FP_ROUND(SDNode *N); SDValue SoftenFloatRes_FPOW(SDNode *N); SDValue SoftenFloatRes_FPOWI(SDNode *N); diff --git a/llvm/test/CodeGen/RISCV/bfloat.ll b/llvm/test/CodeGen/RISCV/bfloat.ll new file mode 100644 index 0000000000000..e7583a595ff06 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/bfloat.ll @@ -0,0 +1,116 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s | FileCheck %s -check-prefix=RV32I-ILP32 +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s -check-prefix=RV64I-LP64 + +; TODO: Enable codegen for hard float. 
+ +define bfloat @float_to_bfloat(float %a) nounwind { +; RV32I-ILP32-LABEL: float_to_bfloat: +; RV32I-ILP32: # %bb.0: +; RV32I-ILP32-NEXT: addi sp, sp, -16 +; RV32I-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-ILP32-NEXT: call __truncsfbf2@plt +; RV32I-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-ILP32-NEXT: addi sp, sp, 16 +; RV32I-ILP32-NEXT: ret +; +; RV64I-LP64-LABEL: float_to_bfloat: +; RV64I-LP64: # %bb.0: +; RV64I-LP64-NEXT: addi sp, sp, -16 +; RV64I-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-LP64-NEXT: call __truncsfbf2@plt +; RV64I-LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-LP64-NEXT: addi sp, sp, 16 +; RV64I-LP64-NEXT: ret + %1 = fptrunc float %a to bfloat + ret bfloat %1 +} + +define bfloat @double_to_bfloat(double %a) nounwind { +; RV32I-ILP32-LABEL: double_to_bfloat: +; RV32I-ILP32: # %bb.0: +; RV32I-ILP32-NEXT: addi sp, sp, -16 +; RV32I-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-ILP32-NEXT: call __truncdfbf2@plt +; RV32I-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-ILP32-NEXT: addi sp, sp, 16 +; RV32I-ILP32-NEXT: ret +; +; RV64I-LP64-LABEL: double_to_bfloat: +; RV64I-LP64: # %bb.0: +; RV64I-LP64-NEXT: addi sp, sp, -16 +; RV64I-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-LP64-NEXT: call __truncdfbf2@plt +; RV64I-LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-LP64-NEXT: addi sp, sp, 16 +; RV64I-LP64-NEXT: ret + %1 = fptrunc double %a to bfloat + ret bfloat %1 +} + +define float @bfloat_to_float(bfloat %a) nounwind { +; RV32I-ILP32-LABEL: bfloat_to_float: +; RV32I-ILP32: # %bb.0: +; RV32I-ILP32-NEXT: slli a0, a0, 16 +; RV32I-ILP32-NEXT: ret +; +; RV64I-LP64-LABEL: bfloat_to_float: +; RV64I-LP64: # %bb.0: +; RV64I-LP64-NEXT: slliw a0, a0, 16 +; RV64I-LP64-NEXT: ret + %1 = fpext bfloat %a to float + ret float %1 +} + +define double @bfloat_to_double(bfloat %a) nounwind { +; RV32I-ILP32-LABEL: bfloat_to_double: +; RV32I-ILP32: # %bb.0: +; RV32I-ILP32-NEXT: addi sp, sp, -16 +; RV32I-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-ILP32-NEXT: slli a0, a0, 16 +; RV32I-ILP32-NEXT: call __extendsfdf2@plt +; RV32I-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-ILP32-NEXT: addi sp, sp, 16 +; RV32I-ILP32-NEXT: ret +; +; RV64I-LP64-LABEL: bfloat_to_double: +; RV64I-LP64: # %bb.0: +; RV64I-LP64-NEXT: addi sp, sp, -16 +; RV64I-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-LP64-NEXT: slli a0, a0, 48 +; RV64I-LP64-NEXT: srli a0, a0, 32 +; RV64I-LP64-NEXT: call __extendsfdf2@plt +; RV64I-LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-LP64-NEXT: addi sp, sp, 16 +; RV64I-LP64-NEXT: ret + %1 = fpext bfloat %a to double + ret double %1 +} + +define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind { +; RV32I-ILP32-LABEL: bfloat_add: +; RV32I-ILP32: # %bb.0: +; RV32I-ILP32-NEXT: addi sp, sp, -16 +; RV32I-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-ILP32-NEXT: slli a0, a0, 16 +; RV32I-ILP32-NEXT: slli a1, a1, 16 +; RV32I-ILP32-NEXT: call __addsf3@plt +; RV32I-ILP32-NEXT: call __truncsfbf2@plt +; RV32I-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-ILP32-NEXT: addi sp, sp, 16 +; RV32I-ILP32-NEXT: ret +; +; RV64I-LP64-LABEL: bfloat_add: +; RV64I-LP64: # %bb.0: +; RV64I-LP64-NEXT: addi sp, sp, -16 +; RV64I-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-LP64-NEXT: slliw a0, a0, 16 +; RV64I-LP64-NEXT: slliw a1, a1, 16 +; RV64I-LP64-NEXT: call __addsf3@plt +; RV64I-LP64-NEXT: call __truncsfbf2@plt +; RV64I-LP64-NEXT: ld ra, 8(sp) # 8-byte 
Folded Reload
+; RV64I-LP64-NEXT:    addi sp, sp, 16
+; RV64I-LP64-NEXT:    ret
+  %1 = fadd bfloat %a, %b
+  ret bfloat %1
+}

From f2a866170c4961137608eee1c26f6eaa1e8e62a1 Mon Sep 17 00:00:00 2001
From: Muhammad Omair Javaid
Date: Mon, 29 May 2023 16:14:51 +0400
Subject: [PATCH 002/704] [LLDB] Fix TestVarPath.py for AArch64 Windows

Since 44bb442, LLDB TestVarPath.py crashes on AArch64 Windows. The
GetValueForVariablePath function seems to be triggering the crash. This
patch disables the parts of this test that cause the crash.

Bug reported upstream:
https://github.com/llvm/llvm-project/issues/62983
---
 lldb/test/API/functionalities/var_path/TestVarPath.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lldb/test/API/functionalities/var_path/TestVarPath.py b/lldb/test/API/functionalities/var_path/TestVarPath.py
index 06b4a2d8da313..f79b2a7584ae9 100644
--- a/lldb/test/API/functionalities/var_path/TestVarPath.py
+++ b/lldb/test/API/functionalities/var_path/TestVarPath.py
@@ -5,6 +5,7 @@

 import lldb
 import lldbsuite.test.lldbutil as lldbutil
+from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *


@@ -115,7 +116,7 @@ def do_test(self):
         self.assertSuccess(v.GetError(), "Make sure we find 'pt_sp'")
         # Make sure we don't crash when looking for non existant child
         # in type with synthetic children. This used to cause a crash.
-        v = frame.GetValueForVariablePath("pt_sp->not_valid_child")
-        self.assertTrue(
-            v.GetError().Fail(), "Make sure we don't find 'pt_sp->not_valid_child'"
-        )
+        if not self.isAArch64Windows():
+            v = frame.GetValueForVariablePath("pt_sp->not_valid_child")
+            self.assertTrue(v.GetError().Fail(),
+                "Make sure we don't find 'pt_sp->not_valid_child'")

From ab05d9134d18db34501985a01fbfc02609767587 Mon Sep 17 00:00:00 2001
From: tcwg
Date: Mon, 29 May 2023 13:57:26 +0100
Subject: [PATCH 003/704] Revert "[LLDB] Add/Remove xfail for some API tests on Windows"

This reverts commit 6ea1a0d4fc3823de143a288df2059b48dc01cf72.
It again marks XFAIL LLDB tests failing after c384fcd3ea1dad782eaaea89b32fc33c0c3528b8 --- .../save_jit_objects/TestSaveJITObjects.py | 1 + .../breakpoint/scripted_bkpt/TestScriptedResolver.py | 3 +++ .../inline-stepping/TestInlineStepping.py | 3 ++- .../step-avoids-no-debug/TestStepNoDebug.py | 12 +++++++----- lldb/test/API/lang/c/step-target/TestStepTarget.py | 4 ++++ .../cpp/global_variables/TestCPPGlobalVariables.py | 1 + lldb/test/API/lang/cpp/namespace/TestNamespace.py | 1 - .../lang/cpp/static_members/TestCPPStaticMembers.py | 2 ++ .../API/python_api/function_symbol/TestDisasmAPI.py | 1 + .../API/python_api/function_symbol/TestSymbolAPI.py | 1 + .../python_api/symbol-context/TestSymbolContext.py | 1 + lldb/test/API/python_api/target/TestTargetAPI.py | 2 ++ lldb/test/API/python_api/value/TestValueAPI.py | 1 + 13 files changed, 26 insertions(+), 7 deletions(-) diff --git a/lldb/test/API/commands/expression/save_jit_objects/TestSaveJITObjects.py b/lldb/test/API/commands/expression/save_jit_objects/TestSaveJITObjects.py index 48377a75d23bb..438b92cdc4846 100644 --- a/lldb/test/API/commands/expression/save_jit_objects/TestSaveJITObjects.py +++ b/lldb/test/API/commands/expression/save_jit_objects/TestSaveJITObjects.py @@ -22,6 +22,7 @@ def cleanJITFiles(self): os.remove(j) return + @expectedFailureAll(oslist=["windows"]) def test_save_jit_objects(self): self.build() os.chdir(self.getBuildDir()) diff --git a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py index 745700a14a3f0..9f477f951cd86 100644 --- a/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py +++ b/lldb/test/API/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py @@ -12,17 +12,20 @@ class TestScriptedResolver(TestBase): NO_DEBUG_INFO_TESTCASE = True + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24528") def test_scripted_resolver(self): """Use a scripted resolver to set a by symbol name breakpoint""" self.build() self.do_test() + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24528") def test_search_depths(self): """Make sure we are called at the right depths depending on what we return from __get_depth__""" self.build() self.do_test_depths() + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24528") def test_command_line(self): """Test setting a resolver breakpoint from the command line""" self.build() diff --git a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py index ae4847bb788fe..49a72c2863e74 100644 --- a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py +++ b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py @@ -25,7 +25,8 @@ def test_step_over_with_python_api(self): self.build() self.inline_stepping_step_over() - @add_test_categories(["pyapi"]) + @add_test_categories(['pyapi']) + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") def test_step_in_template_with_python_api(self): """Test stepping in to templated functions.""" self.build() diff --git a/lldb/test/API/functionalities/step-avoids-no-debug/TestStepNoDebug.py b/lldb/test/API/functionalities/step-avoids-no-debug/TestStepNoDebug.py index 1d4a0008f6b44..6c399b8a0a1b2 100644 --- a/lldb/test/API/functionalities/step-avoids-no-debug/TestStepNoDebug.py +++ b/lldb/test/API/functionalities/step-avoids-no-debug/TestStepNoDebug.py @@ 
-10,7 +10,9 @@ class StepAvoidsNoDebugTestCase(TestBase): - @add_test_categories(["pyapi"]) + + @add_test_categories(['pyapi']) + @expectedFailureAll(archs=["aarch64"], oslist=["windows"], bugnumber="llvm.org/pr56292") def test_step_out_with_python(self): """Test stepping out using avoid-no-debug with dsyms.""" self.build() @@ -24,8 +26,8 @@ def test_step_out_with_python(self): compiler_version=[">=", "3.9"], archs=["i386"], oslist=no_match(["freebsd"]), - bugnumber="llvm.org/pr28549", - ) + bugnumber="llvm.org/pr28549") + @expectedFailureAll(archs=["aarch64"], oslist=["windows"], bugnumber="llvm.org/pr56292") def test_step_over_with_python(self): """Test stepping over using avoid-no-debug with dwarf.""" self.build() @@ -39,8 +41,8 @@ def test_step_over_with_python(self): compiler_version=[">=", "3.9"], archs=["i386"], oslist=no_match(["freebsd"]), - bugnumber="llvm.org/pr28549", - ) + bugnumber="llvm.org/pr28549") + @expectedFailureAll(archs=["aarch64"], oslist=["windows"], bugnumber="llvm.org/pr56292") def test_step_in_with_python(self): """Test stepping in using avoid-no-debug with dwarf.""" self.build() diff --git a/lldb/test/API/lang/c/step-target/TestStepTarget.py b/lldb/test/API/lang/c/step-target/TestStepTarget.py index 457dba15e2ca0..2da0a7894655d 100644 --- a/lldb/test/API/lang/c/step-target/TestStepTarget.py +++ b/lldb/test/API/lang/c/step-target/TestStepTarget.py @@ -45,6 +45,7 @@ def get_to_start(self): thread = threads[0] return thread + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") def test_with_end_line(self): """Test stepping over vrs. hitting breakpoints & subsequent stepping in various forms.""" @@ -56,6 +57,7 @@ def test_with_end_line(self): self.assertEqual(frame.name, "lotsOfArgs", "Stepped to lotsOfArgs.") + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") def test_with_end_line_bad_name(self): """Test stepping over vrs. hitting breakpoints & subsequent stepping in various forms.""" @@ -78,6 +80,7 @@ def test_with_end_line_deeper(self): frame = thread.frames[0] self.assertEqual(frame.name, "modifyInt", "Stepped to modifyInt.") + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") def test_with_command_and_block(self): """Test stepping over vrs. hitting breakpoints & subsequent stepping in various forms.""" @@ -92,6 +95,7 @@ def test_with_command_and_block(self): frame = thread.frames[0] self.assertEqual(frame.name, "lotsOfArgs", "Stepped to lotsOfArgs.") + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") def test_with_command_and_block_and_bad_name(self): """Test stepping over vrs. 
hitting breakpoints & subsequent stepping in various forms.""" diff --git a/lldb/test/API/lang/cpp/global_variables/TestCPPGlobalVariables.py b/lldb/test/API/lang/cpp/global_variables/TestCPPGlobalVariables.py index 1a811130a7eac..9d964634adbbd 100644 --- a/lldb/test/API/lang/cpp/global_variables/TestCPPGlobalVariables.py +++ b/lldb/test/API/lang/cpp/global_variables/TestCPPGlobalVariables.py @@ -75,6 +75,7 @@ def test(self): ) self.assertEqual(var.GetValue(), "100") + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24764") def test_access_by_mangled_name(self): self.build() diff --git a/lldb/test/API/lang/cpp/namespace/TestNamespace.py b/lldb/test/API/lang/cpp/namespace/TestNamespace.py index 1dc9d00fcd993..960cdac06deae 100644 --- a/lldb/test/API/lang/cpp/namespace/TestNamespace.py +++ b/lldb/test/API/lang/cpp/namespace/TestNamespace.py @@ -37,7 +37,6 @@ def test_breakpoints_func_auto(self): ) @expectedFailureAll(bugnumber="llvm.org/pr28548", compiler="gcc") - @expectedFailureAll(oslist=["windows"]) def test_breakpoints_func_full(self): """Test that we can set breakpoints correctly by fullname to find all functions whose fully qualified name is "func" (no namespaces).""" diff --git a/lldb/test/API/lang/cpp/static_members/TestCPPStaticMembers.py b/lldb/test/API/lang/cpp/static_members/TestCPPStaticMembers.py index 6eb5d46b5d97c..33685d61b7168 100644 --- a/lldb/test/API/lang/cpp/static_members/TestCPPStaticMembers.py +++ b/lldb/test/API/lang/cpp/static_members/TestCPPStaticMembers.py @@ -44,6 +44,8 @@ def test_access_without_scope(self): startstr="error: use of undeclared identifier 's_d'", ) + # We fail to lookup static members on Windows. + @expectedFailureAll(oslist=["windows"]) def test_no_crash_in_IR_arithmetic(self): """ Test that LLDB doesn't crash on evaluating specific expression involving diff --git a/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py b/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py index 75b9082f7ab1b..572d76e17c768 100644 --- a/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py +++ b/lldb/test/API/python_api/function_symbol/TestDisasmAPI.py @@ -20,6 +20,7 @@ def setUp(self): "main.c", "// Find the line number for breakpoint 2 here." ) + @expectedFailureAll(oslist=["windows"], bugnumber='llvm.org/pr21765') def test(self): """Exercise getting SBAddress objects, disassembly, and SBAddress APIs.""" self.build() diff --git a/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py b/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py index fb6073bbd26ee..04c807bbb76a5 100644 --- a/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py +++ b/lldb/test/API/python_api/function_symbol/TestSymbolAPI.py @@ -20,6 +20,7 @@ def setUp(self): "main.c", "// Find the line number for breakpoint 2 here." ) + @expectedFailureAll(oslist=["windows"], bugnumber='llvm.org/pr21765') def test(self): """Exercise some SBSymbol and SBAddress APIs.""" self.build() diff --git a/lldb/test/API/python_api/symbol-context/TestSymbolContext.py b/lldb/test/API/python_api/symbol-context/TestSymbolContext.py index 7674d10d771fa..4c125779f60bb 100644 --- a/lldb/test/API/python_api/symbol-context/TestSymbolContext.py +++ b/lldb/test/API/python_api/symbol-context/TestSymbolContext.py @@ -17,6 +17,7 @@ def setUp(self): "main.c", '// Find the line number of function "c" here.' 
) + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24778") def test(self): """Exercise SBSymbolContext API extensively.""" self.build() diff --git a/lldb/test/API/python_api/target/TestTargetAPI.py b/lldb/test/API/python_api/target/TestTargetAPI.py index c1fed81dd8c55..c9e7e80a400b6 100644 --- a/lldb/test/API/python_api/target/TestTargetAPI.py +++ b/lldb/test/API/python_api/target/TestTargetAPI.py @@ -42,6 +42,7 @@ def test_find_compile_units(self): self.setTearDownCleanup(dictionary=d) self.find_compile_units(self.getBuildArtifact("b.out")) + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24778") def test_find_functions(self): """Exercise SBTarget.FindFunctions() API.""" d = {"EXE": "b.out"} @@ -54,6 +55,7 @@ def test_get_description(self): self.build() self.get_description() + @expectedFailureAll(oslist=["windows"], bugnumber='llvm.org/pr21765') def test_resolve_symbol_context_with_address(self): """Exercise SBTarget.ResolveSymbolContextForAddress() API.""" self.build() diff --git a/lldb/test/API/python_api/value/TestValueAPI.py b/lldb/test/API/python_api/value/TestValueAPI.py index 57139f7d2d84b..dc68eb6c5748d 100644 --- a/lldb/test/API/python_api/value/TestValueAPI.py +++ b/lldb/test/API/python_api/value/TestValueAPI.py @@ -17,6 +17,7 @@ def setUp(self): # Find the line number to of function 'c'. self.line = line_number("main.c", "// Break at this line") + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24772") def test(self): """Exercise some SBValue APIs.""" d = {"EXE": self.exe_name} From cd2fc73b49851540b06f91e89a42bdc5affa7e49 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 29 May 2023 15:44:35 +0100 Subject: [PATCH 004/704] Revert "[ValueTracking][InstCombine] Add a new API to allow to ignore poison generating flags or metadatas when implying poison" This reverts commit 754f3ae65518331b7175d7a9b4a124523ebe6eac. Unfortunately the change can cause regressions due to dropping flags from instructions (like nuw,nsw,inbounds), prevent further optimizations depending on those flags. A simple example is the IR below, where `inbounds` is dropped with the patch and the phase-ordering test added in 7c91d82ab912fae8b. define i1 @test(ptr %base, i64 noundef %len, ptr %p2) { bb: %gep = getelementptr inbounds i32, ptr %base, i64 %len %c.1 = icmp uge ptr %p2, %base %c.2 = icmp ult ptr %p2, %gep %select = select i1 %c.1, i1 %c.2, i1 false ret i1 %select } For more discussion, see D149404. --- llvm/include/llvm/Analysis/ValueTracking.h | 7 ---- llvm/lib/Analysis/ValueTracking.cpp | 34 +++++------------ .../InstCombine/InstCombineSelect.cpp | 37 +++++-------------- llvm/test/Transforms/InstCombine/ispow2.ll | 30 ++++++--------- .../InstCombine/prevent-cmp-merge.ll | 4 +- .../iterator-with-runtime-check.ll | 5 +-- 6 files changed, 35 insertions(+), 82 deletions(-) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 4132654ac94d0..48bd1ee228b9f 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -946,13 +946,6 @@ bool canCreatePoison(const Operator *Op, bool ConsiderFlagsAndMetadata = true); /// impliesPoison returns true. bool impliesPoison(const Value *ValAssumedPoison, const Value *V); -/// Return true if V is poison given that ValAssumedPoison is already poison. -/// Poison generating flags or metadata are ignored in the process of implying. -/// And the ignored instructions will be recorded in IgnoredInsts. 
-bool impliesPoisonIgnoreFlagsOrMetadata( - Value *ValAssumedPoison, const Value *V, - SmallVectorImpl &IgnoredInsts); - /// Return true if this function can prove that V does not have undef bits /// and is never poison. If V is an aggregate value or vector, check whether /// all elements (except padding) are not undef or poison. diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 7ec34cdca0be5..fc15fb8c02726 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6599,9 +6599,8 @@ static bool directlyImpliesPoison(const Value *ValAssumedPoison, return false; } -static bool -impliesPoison(Value *ValAssumedPoison, const Value *V, unsigned Depth, - SmallVectorImpl *IgnoredInsts = nullptr) { +static bool impliesPoison(const Value *ValAssumedPoison, const Value *V, + unsigned Depth) { if (isGuaranteedNotToBePoison(ValAssumedPoison)) return true; @@ -6612,30 +6611,17 @@ impliesPoison(Value *ValAssumedPoison, const Value *V, unsigned Depth, if (Depth >= MaxDepth) return false; - auto *I = dyn_cast(ValAssumedPoison); - if (!I || canCreatePoison(cast(I), - /*ConsiderFlagsAndMetadata*/ !IgnoredInsts)) - return false; - - for (Value *Op : I->operands()) - if (!impliesPoison(Op, V, Depth + 1, IgnoredInsts)) - return false; - - if (IgnoredInsts && I->hasPoisonGeneratingFlagsOrMetadata()) - IgnoredInsts->push_back(I); - - return true; + const auto *I = dyn_cast(ValAssumedPoison); + if (I && !canCreatePoison(cast(I))) { + return all_of(I->operands(), [=](const Value *Op) { + return impliesPoison(Op, V, Depth + 1); + }); + } + return false; } bool llvm::impliesPoison(const Value *ValAssumedPoison, const Value *V) { - return ::impliesPoison(const_cast(ValAssumedPoison), V, - /* Depth */ 0); -} - -bool llvm::impliesPoisonIgnoreFlagsOrMetadata( - Value *ValAssumedPoison, const Value *V, - SmallVectorImpl &IgnoredInsts) { - return ::impliesPoison(ValAssumedPoison, V, /* Depth */ 0, &IgnoredInsts); + return ::impliesPoison(ValAssumedPoison, V, /* Depth */ 0); } static bool programUndefinedIfUndefOrPoison(const Value *V, diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 1b29304338092..32b3c56dc9a21 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2924,32 +2924,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { auto *Zero = ConstantInt::getFalse(SelType); Value *A, *B, *C, *D; - auto dropPoisonGeneratingFlagsAndMetadata = - [](ArrayRef Insts) { - for (auto *I : Insts) - I->dropPoisonGeneratingFlagsAndMetadata(); - }; // Folding select to and/or i1 isn't poison safe in general. impliesPoison // checks whether folding it does not convert a well-defined value into // poison. if (match(TrueVal, m_One())) { + if (impliesPoison(FalseVal, CondVal)) { + // Change: A = select B, true, C --> A = or B, C + return BinaryOperator::CreateOr(CondVal, FalseVal); + } + if (auto *LHS = dyn_cast(CondVal)) if (auto *RHS = dyn_cast(FalseVal)) if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ false, /*IsSelectLogical*/ true)) return replaceInstUsesWith(SI, V); - // Some patterns can be matched by both of the above and following - // combinations. Because we need to drop poison generating - // flags and metadatas for the following combination, it has less priority - // than the above combination. 
- SmallVector IgnoredInsts; - if (impliesPoisonIgnoreFlagsOrMetadata(FalseVal, CondVal, IgnoredInsts)) { - dropPoisonGeneratingFlagsAndMetadata(IgnoredInsts); - // Change: A = select B, true, C --> A = or B, C - return BinaryOperator::CreateOr(CondVal, FalseVal); - } - // (A && B) || (C && B) --> (A || C) && B if (match(CondVal, m_LogicalAnd(m_Value(A), m_Value(B))) && match(FalseVal, m_LogicalAnd(m_Value(C), m_Value(D))) && @@ -2980,23 +2969,17 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { } if (match(FalseVal, m_Zero())) { + if (impliesPoison(TrueVal, CondVal)) { + // Change: A = select B, C, false --> A = and B, C + return BinaryOperator::CreateAnd(CondVal, TrueVal); + } + if (auto *LHS = dyn_cast(CondVal)) if (auto *RHS = dyn_cast(TrueVal)) if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ true, /*IsSelectLogical*/ true)) return replaceInstUsesWith(SI, V); - // Some patterns can be matched by both of the above and following - // combinations. Because we need to drop poison generating - // flags and metadatas for the following combination, it has less priority - // than the above combination. - SmallVector IgnoredInsts; - if (impliesPoisonIgnoreFlagsOrMetadata(TrueVal, CondVal, IgnoredInsts)) { - dropPoisonGeneratingFlagsAndMetadata(IgnoredInsts); - // Change: A = select B, C, false --> A = and B, C - return BinaryOperator::CreateAnd(CondVal, TrueVal); - } - // (A || B) && (C || B) --> (A && C) || B if (match(CondVal, m_LogicalOr(m_Value(A), m_Value(B))) && match(TrueVal, m_LogicalOr(m_Value(C), m_Value(D))) && diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll index ce178c29bba2a..191ff9f005a5d 100644 --- a/llvm/test/Transforms/InstCombine/ispow2.ll +++ b/llvm/test/Transforms/InstCombine/ispow2.ll @@ -282,7 +282,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) { ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 3 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = and i1 [[NOTZERO]], [[CMP]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[NOTZERO]], i1 [[CMP]], i1 false ; CHECK-NEXT: ret i1 [[R]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -314,7 +314,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) { ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 2 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 1 -; CHECK-NEXT: [[R:%.*]] = and i1 [[NOTZERO]], [[CMP]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[NOTZERO]], i1 [[CMP]], i1 false ; CHECK-NEXT: ret i1 [[R]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -346,7 +346,7 @@ define i1 @is_pow2_ctpop_wrong_pred1_logical(i32 %x) { ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 2 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = and i1 [[NOTZERO]], [[CMP]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[NOTZERO]], i1 [[CMP]], i1 false ; CHECK-NEXT: ret i1 [[R]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -378,7 +378,7 @@ define i1 @is_pow2_ctpop_wrong_pred2_logical(i32 %x) { ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 2 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = and i1 [[CMP2]], [[CMP]] +; CHECK-NEXT: [[R:%.*]] 
= select i1 [[CMP2]], i1 [[CMP]], i1 false ; CHECK-NEXT: ret i1 [[R]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -493,7 +493,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) { ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 2 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = or i1 [[ISZERO]], [[CMP]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[ISZERO]], i1 true, i1 [[CMP]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -525,7 +525,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) { ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 1 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 1 -; CHECK-NEXT: [[R:%.*]] = or i1 [[ISZERO]], [[CMP]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[ISZERO]], i1 true, i1 [[CMP]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -557,7 +557,7 @@ define i1 @isnot_pow2_ctpop_wrong_pred2_logical(i32 %x) { ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 1 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = or i1 [[CMP2]], [[CMP]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP2]], i1 true, i1 [[CMP]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -855,7 +855,7 @@ define i1 @is_pow2or0_ctpop_wrong_cmp_op1_logical(i32 %x) { ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[T0]], 3 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = or i1 [[ISZERO]], [[CMP]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[ISZERO]], i1 true, i1 [[CMP]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -914,11 +914,7 @@ define i1 @is_pow2or0_ctpop_wrong_pred2(i32 %x) { define i1 @is_pow2or0_ctpop_wrong_pred2_logical(i32 %x) { ; CHECK-LABEL: @is_pow2or0_ctpop_wrong_pred2_logical( -; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[T0]], 1 -; CHECK-NEXT: [[ISZERO:%.*]] = icmp ne i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = or i1 [[ISZERO]], [[CMP]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) %cmp = icmp ne i32 %t0, 1 @@ -1062,7 +1058,7 @@ define i1 @isnot_pow2nor0_ctpop_wrong_cmp_op1_logical(i32 %x) { ; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[T0]], 5 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = and i1 [[NOTZERO]], [[CMP]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[NOTZERO]], i1 [[CMP]], i1 false ; CHECK-NEXT: ret i1 [[R]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -1121,11 +1117,7 @@ define i1 @isnot_pow2nor0_ctpop_wrong_pred2(i32 %x) { define i1 @isnot_pow2nor0_ctpop_wrong_pred2_logical(i32 %x) { ; CHECK-LABEL: @isnot_pow2nor0_ctpop_wrong_pred2_logical( -; CHECK-NEXT: [[T0:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[T0]], 1 -; CHECK-NEXT: [[NOTZERO:%.*]] = icmp eq i32 [[X]], 0 -; CHECK-NEXT: [[R:%.*]] = and i1 [[NOTZERO]], [[CMP]] -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 false ; %t0 = tail call i32 @llvm.ctpop.i32(i32 
%x) %cmp = icmp eq i32 %t0, 1 diff --git a/llvm/test/Transforms/InstCombine/prevent-cmp-merge.ll b/llvm/test/Transforms/InstCombine/prevent-cmp-merge.ll index a24ae9b9c57b9..cd05022b0d35d 100644 --- a/llvm/test/Transforms/InstCombine/prevent-cmp-merge.ll +++ b/llvm/test/Transforms/InstCombine/prevent-cmp-merge.ll @@ -71,10 +71,10 @@ define zeroext i1 @test3(i32 %lhs, i32 %rhs) { define zeroext i1 @test3_logical(i32 %lhs, i32 %rhs) { ; CHECK-LABEL: @test3_logical( -; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[LHS:%.*]], [[RHS:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[LHS:%.*]], [[RHS:%.*]] ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[LHS]], [[RHS]] ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[SUB]], 31 -; CHECK-NEXT: [[SEL:%.*]] = or i1 [[CMP1]], [[CMP2]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP2]] ; CHECK-NEXT: ret i1 [[SEL]] ; diff --git a/llvm/test/Transforms/PhaseOrdering/iterator-with-runtime-check.ll b/llvm/test/Transforms/PhaseOrdering/iterator-with-runtime-check.ll index ebe507d8e9c35..23b1b2b3cd87d 100644 --- a/llvm/test/Transforms/PhaseOrdering/iterator-with-runtime-check.ll +++ b/llvm/test/Transforms/PhaseOrdering/iterator-with-runtime-check.ll @@ -24,12 +24,11 @@ define void @test_fill_with_foreach([2 x i64] %elems.coerce) { ; CHECK-NEXT: [[ELEMS_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[ELEMS_COERCE]], 0 ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[ELEMS_COERCE_FCA_0_EXTRACT]] to ptr ; CHECK-NEXT: [[ELEMS_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[ELEMS_COERCE]], 1 -; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[ELEMS_COERCE_FCA_1_EXTRACT]] +; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[ELEMS_COERCE_FCA_1_EXTRACT]] ; CHECK-NEXT: [[CMP_NOT_I_I_I_I:%.*]] = icmp slt i64 [[ELEMS_COERCE_FCA_1_EXTRACT]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT_I_I_I_I]], label [[ERROR:%.*]], label [[FOR_COND_PREHEADER:%.*]] ; CHECK: for.cond.preheader: -; CHECK-NEXT: [[ADD_PTR_I_IDX_MASK:%.*]] = and i64 [[ELEMS_COERCE_FCA_1_EXTRACT]], 4611686018427387903 -; CHECK-NEXT: [[CMP_I_NOT2:%.*]] = icmp eq i64 [[ADD_PTR_I_IDX_MASK]], 0 +; CHECK-NEXT: [[CMP_I_NOT2:%.*]] = icmp eq i64 [[ELEMS_COERCE_FCA_1_EXTRACT]], 0 ; CHECK-NEXT: br i1 [[CMP_I_NOT2]], label [[COMMON_RET:%.*]], label [[FOR_BODY:%.*]] ; CHECK: common.ret: ; CHECK-NEXT: ret void From 7fb60b0123e50389afbde0286a0e59923d154210 Mon Sep 17 00:00:00 2001 From: Muhammad Omair Javaid Date: Mon, 29 May 2023 18:53:23 +0400 Subject: [PATCH 005/704] [LLDB] Add XFAIL on AArch64/Windows to TestNamespace.py --- lldb/test/API/lang/cpp/namespace/TestNamespace.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/lang/cpp/namespace/TestNamespace.py b/lldb/test/API/lang/cpp/namespace/TestNamespace.py index 960cdac06deae..1dc9d00fcd993 100644 --- a/lldb/test/API/lang/cpp/namespace/TestNamespace.py +++ b/lldb/test/API/lang/cpp/namespace/TestNamespace.py @@ -37,6 +37,7 @@ def test_breakpoints_func_auto(self): ) @expectedFailureAll(bugnumber="llvm.org/pr28548", compiler="gcc") + @expectedFailureAll(oslist=["windows"]) def test_breakpoints_func_full(self): """Test that we can set breakpoints correctly by fullname to find all functions whose fully qualified name is "func" (no namespaces).""" From 98061013e01207444cfd3980cde17b5e75764fbe Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 27 May 2023 17:59:19 +0100 Subject: [PATCH 006/704] [X86] X86FixupVectorConstantsPass - attempt to replace full width fp vector constant loads with broadcasts on AVX+ 
targets lowerBuildVectorAsBroadcast will not broadcast splat constants in all cases, resulting in a lot of situations where a full width vector load that has failed to fold but is loading splat constant values could use a broadcast load instruction just as cheaply, and save constant pool space. NOTE: SSE3 targets can use MOVDDUP but not all SSE era CPUs can perform this as cheaply as a vector load, we will need to add scheduler model checks if we want to pursue this. --- .../Target/X86/X86FixupVectorConstants.cpp | 47 ++++- llvm/test/CodeGen/X86/avx-basic.ll | 2 +- llvm/test/CodeGen/X86/avx-vbroadcast.ll | 4 +- llvm/test/CodeGen/X86/avx2-conversions.ll | 6 +- llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll | 56 +++--- llvm/test/CodeGen/X86/avx2-vbroadcast.ll | 4 +- llvm/test/CodeGen/X86/avx512-regcall-Mask.ll | 5 +- .../X86/avx512-shuffles/partial_permute.ll | 15 +- llvm/test/CodeGen/X86/bitreverse.ll | 4 +- .../X86/broadcast-elm-cross-splat-vec.ll | 4 +- llvm/test/CodeGen/X86/cast-vsel.ll | 4 +- llvm/test/CodeGen/X86/combine-and.ll | 38 +--- llvm/test/CodeGen/X86/combine-sdiv.ll | 18 +- llvm/test/CodeGen/X86/combine-udiv.ll | 15 +- llvm/test/CodeGen/X86/extractelement-load.ll | 82 +++------ llvm/test/CodeGen/X86/fma-fneg-combine-2.ll | 4 +- .../CodeGen/X86/fma-intrinsics-fast-isel.ll | 9 +- llvm/test/CodeGen/X86/fma_patterns.ll | 42 ++--- llvm/test/CodeGen/X86/fma_patterns_wide.ll | 60 +++---- llvm/test/CodeGen/X86/fminimum-fmaximum.ll | 36 ++-- .../test/CodeGen/X86/fold-vector-sext-zext.ll | 24 ++- .../CodeGen/X86/fold-vector-trunc-sitofp.ll | 3 +- llvm/test/CodeGen/X86/fp-round.ll | 8 +- .../X86/insert-into-constant-vector.ll | 6 +- llvm/test/CodeGen/X86/known-bits-vector.ll | 8 +- llvm/test/CodeGen/X86/masked_store_trunc.ll | 10 +- .../CodeGen/X86/masked_store_trunc_usat.ll | 30 ++-- llvm/test/CodeGen/X86/memset-nonzero.ll | 22 +-- .../test/CodeGen/X86/merge-store-constants.ll | 4 +- llvm/test/CodeGen/X86/oddshuffles.ll | 8 +- llvm/test/CodeGen/X86/paddus.ll | 4 +- llvm/test/CodeGen/X86/pr30290.ll | 2 +- llvm/test/CodeGen/X86/pr32368.ll | 4 +- llvm/test/CodeGen/X86/pr38639.ll | 5 +- llvm/test/CodeGen/X86/psubus.ll | 11 +- llvm/test/CodeGen/X86/recip-fastmath.ll | 166 +++++------------- llvm/test/CodeGen/X86/recip-fastmath2.ll | 48 ++--- llvm/test/CodeGen/X86/sadd_sat_vec.ll | 13 +- llvm/test/CodeGen/X86/sat-add.ll | 3 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 3 +- llvm/test/CodeGen/X86/splat-const.ll | 2 +- llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll | 4 +- llvm/test/CodeGen/X86/sqrt-fastmath.ll | 53 +++--- llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll | 36 +--- llvm/test/CodeGen/X86/sse2.ll | 37 ++-- llvm/test/CodeGen/X86/sshl_sat_vec.ll | 6 +- llvm/test/CodeGen/X86/ssub_sat_vec.ll | 13 +- llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll | 18 +- llvm/test/CodeGen/X86/v8i1-masks.ll | 56 +++--- .../CodeGen/X86/vec-strict-fptoint-128.ll | 4 +- .../CodeGen/X86/vec-strict-fptoint-256.ll | 8 +- llvm/test/CodeGen/X86/vec_anyext.ll | 6 +- llvm/test/CodeGen/X86/vec_fabs.ll | 8 +- llvm/test/CodeGen/X86/vec_fp_to_int.ll | 6 +- llvm/test/CodeGen/X86/vec_int_to_fp.ll | 23 ++- .../X86/vector-constrained-fp-intrinsics.ll | 74 ++++---- llvm/test/CodeGen/X86/vector-fshl-256.ll | 4 +- llvm/test/CodeGen/X86/vector-fshr-256.ll | 4 +- .../vector-interleaved-load-i32-stride-3.ll | 3 +- .../vector-interleaved-load-i32-stride-4.ll | 60 ++++--- .../vector-interleaved-load-i32-stride-6.ll | 104 +++++++---- .../vector-interleaved-load-i32-stride-7.ll | 30 ++-- .../vector-interleaved-store-i32-stride-3.ll | 5 
+- .../vector-interleaved-store-i32-stride-5.ll | 2 +- .../vector-interleaved-store-i32-stride-6.ll | 3 +- .../vector-interleaved-store-i32-stride-7.ll | 18 +- .../vector-interleaved-store-i32-stride-8.ll | 12 +- .../CodeGen/X86/vector-reduce-add-mask.ll | 12 +- .../CodeGen/X86/vector-reduce-xor-bool.ll | 2 +- .../CodeGen/X86/vector-shuffle-256-v32.ll | 4 +- .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 5 +- .../test/CodeGen/X86/vector-shuffle-avx512.ll | 3 +- .../X86/vector-shuffle-combining-avx.ll | 6 +- .../CodeGen/X86/vector-shuffle-combining.ll | 49 ++---- llvm/test/CodeGen/X86/vector-trunc-math.ll | 70 ++++---- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 3 +- llvm/test/CodeGen/X86/vector-trunc-usat.ll | 63 ++++--- llvm/test/CodeGen/X86/vector-trunc.ll | 14 +- llvm/test/CodeGen/X86/vselect-avx.ll | 2 +- llvm/test/CodeGen/X86/vselect-zero.ll | 6 +- llvm/test/CodeGen/X86/win_cst_pool.ll | 14 +- 81 files changed, 836 insertions(+), 853 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 3e683cb872531..03e474b9e2e18 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -229,7 +229,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, MachineInstr &MI) { unsigned Opc = MI.getOpcode(); - MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool(); + MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool(); + bool HasDQI = ST->hasDQI(); auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64, unsigned OpBcst32, @@ -262,6 +263,50 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, return false; }; + // Attempt to convert full width vector loads into broadcast loads. + switch (Opc) { + /* FP Loads */ + case X86::MOVAPDrm: + case X86::MOVAPSrm: + case X86::MOVUPDrm: + case X86::MOVUPSrm: + // TODO: SSE3 MOVDDUP Handling + return false; + case X86::VMOVAPDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPDrm: + case X86::VMOVUPSrm: + return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0, + 1); + case X86::VMOVAPDYrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPDYrm: + case X86::VMOVUPSYrm: + return ConvertToBroadcast(0, X86::VBROADCASTF128, X86::VBROADCASTSDYrm, + X86::VBROADCASTSSYrm, 0, 0, 1); + case X86::VMOVAPDZ128rm: + case X86::VMOVAPSZ128rm: + case X86::VMOVUPDZ128rm: + case X86::VMOVUPSZ128rm: + return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm, + X86::VBROADCASTSSZ128rm, 0, 0, 1); + case X86::VMOVAPDZ256rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVUPDZ256rm: + case X86::VMOVUPSZ256rm: + return ConvertToBroadcast( + 0, HasDQI ? X86::VBROADCASTF64X2Z128rm : X86::VBROADCASTF32X4Z256rm, + X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm, 0, 0, 1); + case X86::VMOVAPDZrm: + case X86::VMOVAPSZrm: + case X86::VMOVUPDZrm: + case X86::VMOVUPSZrm: + return ConvertToBroadcast( + HasDQI ? X86::VBROADCASTF32X8rm : X86::VBROADCASTF64X4rm, + HasDQI ? X86::VBROADCASTF64X2rm : X86::VBROADCASTF32X4rm, + X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 1); + } + // Attempt to find a AVX512 mapping from a full width memory-fold instruction // to a broadcast-fold instruction variant. 
if ((MI.getDesc().TSFlags & X86II::EncodingMask) == X86II::EVEX) { diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll index d37d290e55a25..b47f424acc942 100644 --- a/llvm/test/CodeGen/X86/avx-basic.ll +++ b/llvm/test/CodeGen/X86/avx-basic.ll @@ -87,7 +87,7 @@ define <8 x i32> @VMOVZQI2PQI(ptr nocapture %aFOO) nounwind { define <16 x float> @fneg(<16 x float> %a) nounwind { ; CHECK-LABEL: fneg: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index f17cbc31fe66a..54bce767f1fcc 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -300,12 +300,12 @@ entry: define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp { ; X86-LABEL: _e2: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] +; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] ; X86-NEXT: retl ; ; X64-LABEL: _e2: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] +; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] ; X64-NEXT: retq entry: %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0 diff --git a/llvm/test/CodeGen/X86/avx2-conversions.ll b/llvm/test/CodeGen/X86/avx2-conversions.ll index 0dd83eec50fcf..7b35e602cc0fa 100644 --- a/llvm/test/CodeGen/X86/avx2-conversions.ll +++ b/llvm/test/CodeGen/X86/avx2-conversions.ll @@ -16,7 +16,8 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind { ; ; X86-FAST-ALL-LABEL: trunc4: ; X86-FAST-ALL: # %bb.0: -; X86-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; X86-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; X86-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; X86-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X86-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; X86-FAST-ALL-NEXT: vzeroupper @@ -38,7 +39,8 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind { ; ; X64-FAST-ALL-LABEL: trunc4: ; X64-FAST-ALL: # %bb.0: -; X64-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; X64-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; X64-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; X64-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X64-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; X64-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll index 734e56008e083..15e2c3890354f 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -72,30 +72,34 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readn define <32 x i8> @test_x86_avx2_packsswb_fold() { ; X86-AVX-LABEL: test_x86_avx2_packsswb_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: 
vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1] ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packsswb_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> zeroinitializer) ret <32 x i8> %res @@ -121,30 +125,34 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readn define <32 x i8> @test_x86_avx2_packuswb_fold() { ; X86-AVX-LABEL: test_x86_avx2_packuswb_fold: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = 
[0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: # ymm0 = mem[0,1,0,1] ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packuswb_fold: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: # ymm0 = mem[0,1,0,1] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vbroadcastf128 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x1a,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> , <16 x i16> zeroinitializer) ret <32 x i8> %res diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll index 70d99e79e1e57..b7516d30df5f6 100644 --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -657,12 +657,12 @@ define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp { define <8 x i8> @_e4(ptr %ptr) nounwind uwtable readnone ssp { ; X86-LABEL: _e4: ; X86: ## %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> +; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52] ; X86-NEXT: retl ; ; X64-LABEL: _e4: ; X64: ## %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> +; X64-NEXT: 
vbroadcastss {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52,52,52,52,52,52,52,52,52] ; X64-NEXT: retq %vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0 %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1 diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll index 474f6a9e1948e..34a205a7baa86 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -98,9 +98,10 @@ define dso_local i64 @caller_argv64i1() #0 { ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %edi ; X32-NEXT: subl $88, %esp -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2,1,2,1] +; X32-NEXT: vmovddup {{.*#+}} xmm0 = [2,1,2,1] +; X32-NEXT: # xmm0 = mem[0,0] ; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: vmovaps {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1] +; X32-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1] ; X32-NEXT: vmovups %zmm0, (%esp) ; X32-NEXT: movl $1, {{[0-9]+}}(%esp) ; X32-NEXT: movl $2, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index 0086d05d1ef9c..cc0da34453eb5 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -3630,7 +3630,8 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <0,10,6,15,u,u,u,u> +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,10,6,15,0,10,6,15] +; CHECK-NEXT: # ymm3 = mem[0,1,0,1] ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 @@ -3648,7 +3649,8 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <0,10,6,15,u,u,u,u> +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,10,6,15,0,10,6,15] +; CHECK-NEXT: # ymm1 = mem[0,1,0,1] ; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 ; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} @@ -3892,7 +3894,8 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,3,7,3] +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,3,7,3] +; CHECK-NEXT: # ymm1 = mem[0,1,0,1] ; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; CHECK-NEXT: retq @@ -3902,7 +3905,8 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) { define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [7,3,7,3] +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [7,3,7,3] +; CHECK-NEXT: # ymm3 = mem[0,1,0,1] ; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 @@ -3917,7 +3921,8 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v 
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [7,3,7,3] +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [7,3,7,3] +; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index 2203d82907930..bcae88259a92e 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -592,12 +592,12 @@ define <2 x i16> @fold_v2i16() { ; ; X86XOP-LABEL: fold_v2i16: ; X86XOP: # %bb.0: -; X86XOP-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u> +; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240] ; X86XOP-NEXT: retl ; ; GFNI-LABEL: fold_v2i16: ; GFNI: # %bb.0: -; GFNI-NEXT: vmovaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u> +; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240] ; GFNI-NEXT: retq %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> ) ret <2 x i16> %b diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index c0dc8033710ed..94500997987c9 100644 --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -296,7 +296,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) { ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -328,7 +328,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) { ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll index 6b86b7f912ca1..2fd7b34eceec9 100644 --- a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -194,7 +194,7 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) ; AVX1-LABEL: trunc: ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 @@ -337,7 +337,7 @@ define dso_local void @example25() nounwind { ; AVX1-LABEL: example25: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] ; AVX1-NEXT: .p2align 4, 0x90 ; 
AVX1-NEXT: .LBB5_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll index 43c85fdc703bf..d223b75419ac4 100644 --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -325,21 +325,11 @@ define <2 x i64> @and_or_v2i64(<2 x i64> %a0) { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,8] ; SSE-NEXT: retq ; -; AVX1-LABEL: and_or_v2i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,8] -; AVX1-NEXT: retq -; -; AVX2-LABEL: and_or_v2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,8] -; AVX2-NEXT: retq -; -; AVX512-LABEL: and_or_v2i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovddup {{.*#+}} xmm0 = [8,8] -; AVX512-NEXT: # xmm0 = mem[0,0] -; AVX512-NEXT: retq +; AVX-LABEL: and_or_v2i64: +; AVX: # %bb.0: +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [8,8] +; AVX-NEXT: # xmm0 = mem[0,0] +; AVX-NEXT: retq %1 = or <2 x i64> %a0, %2 = and <2 x i64> %1, ret <2 x i64> %2 @@ -351,20 +341,10 @@ define <4 x i32> @and_or_v4i32(<4 x i32> %a0) { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3] ; SSE-NEXT: retq ; -; AVX1-LABEL: and_or_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [3,3,3,3] -; AVX1-NEXT: retq -; -; AVX2-LABEL: and_or_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3] -; AVX2-NEXT: retq -; -; AVX512-LABEL: and_or_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3] -; AVX512-NEXT: retq +; AVX-LABEL: and_or_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3] +; AVX-NEXT: retq %1 = or <4 x i32> %a0, %2 = and <4 x i32> %1, ret <4 x i32> %2 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 0f5f28a857940..bcdcfdd714784 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -147,20 +147,10 @@ define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_vec_sdiv_dupe: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] -; AVX1-NEXT: retq -; -; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe: -; AVX2ORLATER: # %bb.0: -; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; AVX2ORLATER-NEXT: retq -; -; XOP-LABEL: combine_vec_sdiv_dupe: -; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] -; XOP-NEXT: retq +; AVX-LABEL: combine_vec_sdiv_dupe: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; AVX-NEXT: retq %1 = sdiv <4 x i32> %x, %x ret <4 x i32> %1 } diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index f4b13ee495ec0..e013d8cd33598 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -135,19 +135,14 @@ define <4 x i32> @combine_vec_udiv_dupe(<4 x i32> %x) { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_vec_udiv_dupe: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_vec_udiv_dupe: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; AVX2-NEXT: retq +; AVX-LABEL: combine_vec_udiv_dupe: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; AVX-NEXT: retq ; ; XOP-LABEL: combine_vec_udiv_dupe: ; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] ; XOP-NEXT: retq %1 = udiv <4 x 
i32> %x, %x ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll index 1e891b5330a3c..538b8ed10f25b 100644 --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -469,61 +469,33 @@ define i32 @main() nounwind { ; X64-SSSE3-NEXT: popq %rbp ; X64-SSSE3-NEXT: retq ; -; X64-AVX1-LABEL: main: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: pushq %rbp -; X64-AVX1-NEXT: movq %rsp, %rbp -; X64-AVX1-NEXT: andq $-32, %rsp -; X64-AVX1-NEXT: subq $64, %rsp -; X64-AVX1-NEXT: movq n1@GOTPCREL(%rip), %rax -; X64-AVX1-NEXT: vmovaps (%rax), %ymm0 -; X64-AVX1-NEXT: movl zero+4(%rip), %ecx -; X64-AVX1-NEXT: movl zero+8(%rip), %eax -; X64-AVX1-NEXT: vmovaps %ymm0, zero(%rip) -; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] -; X64-AVX1-NEXT: vmovaps %ymm0, (%rsp) -; X64-AVX1-NEXT: vmovaps (%rsp), %ymm0 -; X64-AVX1-NEXT: vextractps $2, %xmm0, %esi -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %esi -; X64-AVX1-NEXT: movl %eax, %esi -; X64-AVX1-NEXT: vextractps $1, %xmm0, %edi -; X64-AVX1-NEXT: movl %ecx, %eax -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %edi -; X64-AVX1-NEXT: addl %esi, %eax -; X64-AVX1-NEXT: movq %rbp, %rsp -; X64-AVX1-NEXT: popq %rbp -; X64-AVX1-NEXT: vzeroupper -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: main: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: pushq %rbp -; X64-AVX2-NEXT: movq %rsp, %rbp -; X64-AVX2-NEXT: andq $-32, %rsp -; X64-AVX2-NEXT: subq $64, %rsp -; X64-AVX2-NEXT: movq n1@GOTPCREL(%rip), %rax -; X64-AVX2-NEXT: vmovaps (%rax), %ymm0 -; X64-AVX2-NEXT: movl zero+4(%rip), %ecx -; X64-AVX2-NEXT: movl zero+8(%rip), %eax -; X64-AVX2-NEXT: vmovaps %ymm0, zero(%rip) -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] -; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp) -; X64-AVX2-NEXT: vmovaps (%rsp), %ymm0 -; X64-AVX2-NEXT: vextractps $2, %xmm0, %esi -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %esi -; X64-AVX2-NEXT: movl %eax, %esi -; X64-AVX2-NEXT: vextractps $1, %xmm0, %edi -; X64-AVX2-NEXT: movl %ecx, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %edi -; X64-AVX2-NEXT: addl %esi, %eax -; X64-AVX2-NEXT: movq %rbp, %rsp -; X64-AVX2-NEXT: popq %rbp -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: main: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: pushq %rbp +; X64-AVX-NEXT: movq %rsp, %rbp +; X64-AVX-NEXT: andq $-32, %rsp +; X64-AVX-NEXT: subq $64, %rsp +; X64-AVX-NEXT: movq n1@GOTPCREL(%rip), %rax +; X64-AVX-NEXT: vmovaps (%rax), %ymm0 +; X64-AVX-NEXT: movl zero+4(%rip), %ecx +; X64-AVX-NEXT: movl zero+8(%rip), %eax +; X64-AVX-NEXT: vmovaps %ymm0, zero(%rip) +; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] +; X64-AVX-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX-NEXT: vmovaps (%rsp), %ymm0 +; X64-AVX-NEXT: vextractps $2, %xmm0, %esi +; X64-AVX-NEXT: xorl %edx, %edx +; X64-AVX-NEXT: divl %esi +; X64-AVX-NEXT: movl %eax, %esi +; X64-AVX-NEXT: vextractps $1, %xmm0, %edi +; X64-AVX-NEXT: movl %ecx, %eax +; X64-AVX-NEXT: xorl %edx, %edx +; X64-AVX-NEXT: divl %edi +; X64-AVX-NEXT: addl %esi, %eax +; X64-AVX-NEXT: movq %rbp, %rsp +; X64-AVX-NEXT: popq %rbp +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq %stackptr = alloca <8 x i32>, align 32 %z = load <8 x i32>, ptr @zero, align 32 %t1 = load <8 x i32>, ptr @n1, align 32 diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll index 2a3c3e3c7f4f7..bb8ee2238a004 100644 --- 
a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll +++ b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll @@ -189,7 +189,7 @@ define <4 x double> @negated_constant_v4f64_fadd(<4 x double> %a) { define <4 x double> @negated_constant_v4f64_2fma_undefs(<4 x double> %a, <4 x double> %b) { ; FMA3-LABEL: negated_constant_v4f64_2fma_undefs: ; FMA3: # %bb.0: -; FMA3-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; FMA3-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; FMA3-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + mem ; FMA3-NEXT: vfmadd132pd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2 ; FMA3-NEXT: vaddpd %ymm1, %ymm0, %ymm0 @@ -197,7 +197,7 @@ define <4 x double> @negated_constant_v4f64_2fma_undefs(<4 x double> %a, <4 x do ; ; FMA4-LABEL: negated_constant_v4f64_2fma_undefs: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + mem ; FMA4-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * mem) + ymm2 ; FMA4-NEXT: vaddpd %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll index e4cc8f23fd38e..a886a3c830340 100644 --- a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll @@ -160,7 +160,7 @@ entry: define <4 x float> @test_mm_fnmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: test_mm_fnmsub_ps: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %xmm3, %xmm0, %xmm4 ; CHECK-NEXT: vxorps %xmm3, %xmm2, %xmm0 ; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0 @@ -175,7 +175,8 @@ entry: define <2 x double> @test_mm_fnmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: test_mm_fnmsub_pd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; CHECK-NEXT: # xmm3 = mem[0,0] ; CHECK-NEXT: vxorpd %xmm3, %xmm0, %xmm4 ; CHECK-NEXT: vxorpd %xmm3, %xmm2, %xmm0 ; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0 @@ -342,7 +343,7 @@ entry: define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) { ; CHECK-LABEL: test_mm256_fnmsub_ps: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vxorps %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0 @@ -357,7 +358,7 @@ entry: define <4 x double> @test_mm256_fnmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) { ; CHECK-LABEL: test_mm256_fnmsub_pd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorpd %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vxorpd %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0 diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index bac41849a4108..aa99672b8fc6a 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ 
b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -791,14 +791,14 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x fl define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA4-INFS-NEXT: retq @@ -832,14 +832,14 @@ define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) { define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -873,14 +873,14 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) { define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0> +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0> +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -914,14 +914,14 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA4-INFS-NEXT: retq @@ -955,14 +955,14 @@ define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x 
float> %y define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -996,14 +996,14 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0> +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0> +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -1318,7 +1318,7 @@ define float @test_f32_interp(float %x, float %y, float %t) { define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) { ; FMA-INFS-LABEL: test_v4f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3 ; FMA-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1 ; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 @@ -1326,7 +1326,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float ; ; FMA4-INFS-LABEL: test_v4f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3 ; FMA4-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1 ; FMA4-INFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 @@ -1367,7 +1367,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) { ; FMA-INFS-LABEL: test_v8f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 ; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 @@ -1375,7 +1375,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float ; ; FMA4-INFS-LABEL: test_v8f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: 
vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 ; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 @@ -1465,7 +1465,8 @@ define double @test_f64_interp(double %x, double %y, double %t) { define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) { ; FMA-INFS-LABEL: test_v2f64_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA-INFS-NEXT: # xmm3 = mem[0,0] ; FMA-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3 ; FMA-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1 ; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 @@ -1473,7 +1474,8 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do ; ; FMA4-INFS-LABEL: test_v2f64_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: # xmm3 = mem[0,0] ; FMA4-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3 ; FMA4-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1 ; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 @@ -1515,7 +1517,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) { ; FMA-INFS-LABEL: test_v4f64_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3 ; FMA-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 @@ -1523,7 +1525,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do ; ; FMA4-INFS-LABEL: test_v4f64_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3 ; FMA4-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll index 9d190a18c4552..fe5ddca67470c 100644 --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -259,7 +259,7 @@ define <8 x double> @test_8f64_fmsub_load(ptr %a0, <8 x double> %a1, <8 x double define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -268,7 +268,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> % ; ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; 
FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -305,7 +305,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> % define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -314,7 +314,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -351,7 +351,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -360,7 +360,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float ; ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -397,7 +397,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -406,7 +406,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -443,7 +443,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -452,7 +452,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> % ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -490,7 +490,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> % define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -499,7 +499,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -537,7 +537,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -546,7 +546,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -584,7 +584,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -593,7 +593,7 @@ define <8 x double> 
@test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -631,7 +631,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -640,7 +640,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> % ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -677,7 +677,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> % define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -686,7 +686,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -723,7 +723,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -732,7 +732,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -769,7 +769,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -778,7 +778,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -819,7 +819,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) { ; FMA-INFS-LABEL: test_v16f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 ; FMA-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 ; FMA-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3 @@ -830,7 +830,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x ; ; FMA4-INFS-LABEL: test_v16f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 ; FMA4-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 ; FMA4-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3 @@ -878,7 +878,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) { ; FMA-INFS-LABEL: test_v8f64_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7 ; FMA-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6 ; FMA-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3 @@ -889,7 +889,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do ; ; FMA4-INFS-LABEL: test_v8f64_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7 ; FMA4-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6 ; FMA4-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3 @@ -1143,7 +1143,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> % ; FMA: # %bb.0: ; FMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 
; FMA-NEXT: vxorpd %ymm2, %ymm0, %ymm0 ; FMA-NEXT: vxorpd %ymm2, %ymm1, %ymm1 ; FMA-NEXT: retq @@ -1152,7 +1152,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> % ; FMA4: # %bb.0: ; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; FMA4-NEXT: vxorpd %ymm2, %ymm0, %ymm0 ; FMA4-NEXT: vxorpd %ymm2, %ymm1, %ymm1 ; FMA4-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll index 9c2a7adf5431a..5bb5d1e9c17ec 100644 --- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -1070,21 +1070,15 @@ define <4 x float> @test_fmaximum_vector_signed_zero(<4 x float> %x) { ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; AVX1-LABEL: test_fmaximum_vector_signed_zero: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX1-NEXT: vmaxps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: test_fmaximum_vector_signed_zero: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: test_fmaximum_vector_signed_zero: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq ; ; X86-LABEL: test_fmaximum_vector_signed_zero: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> ) @@ -1283,21 +1277,15 @@ define <4 x float> @test_fmaximum_vector_signed_zero_first(<4 x float> %x) { ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; AVX1-LABEL: test_fmaximum_vector_signed_zero_first: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX1-NEXT: vmaxps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: test_fmaximum_vector_signed_zero_first: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: test_fmaximum_vector_signed_zero_first: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq ; ; X86-LABEL: test_fmaximum_vector_signed_zero_first: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 ; X86-NEXT: retl %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> , <4 x float> %x) diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll index 3ff68ac329c99..3f8bd24c38049 100644 --- a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll +++ b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll @@ -11,12 +11,14 @@ define <4 x i16> @test_sext_4i8_4i16() { ; X32-LABEL: test_sext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u> +; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533] +; X32-NEXT: # xmm0 = mem[0,0] ; 
X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u> +; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533] +; X64-NEXT: # xmm0 = mem[0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -29,12 +31,14 @@ define <4 x i16> @test_sext_4i8_4i16() { define <4 x i16> @test_sext_4i8_4i16_undef() { ; X32-LABEL: test_sext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,0,65533,u,u,u,u> +; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533] +; X32-NEXT: # xmm0 = mem[0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,0,65533,u,u,u,u> +; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533] +; X64-NEXT: # xmm0 = mem[0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -207,12 +211,14 @@ define <8 x i32> @test_sext_8i8_8i32_undef() { define <4 x i16> @test_zext_4i8_4i16() { ; X32-LABEL: test_zext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u> +; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253] +; X32-NEXT: # xmm0 = mem[0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,2,253,u,u,u,u> +; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253] +; X64-NEXT: # xmm0 = mem[0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -261,12 +267,14 @@ define <4 x i64> @test_zext_4i8_4i64() { define <4 x i16> @test_zext_4i8_4i16_undef() { ; X32-LABEL: test_zext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u> +; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253] +; X32-NEXT: # xmm0 = mem[0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,255,0,253,u,u,u,u> +; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253] +; X64-NEXT: # xmm0 = mem[0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 diff --git a/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll b/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll index 73c7dc1fae56f..41989122a01eb 100644 --- a/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll +++ b/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll @@ -7,7 +7,8 @@ define <4 x float> @test1() { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0] +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0] +; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: ret{{[l|q]}} %1 = trunc <4 x i3> to <4 x i1> %2 = sitofp <4 x i1> %1 to <4 x float> diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll index e8f3f069d01b3..8efd5819a6d22 100644 --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -572,9 +572,9 @@ define <16 x float> @round_v16f32(<16 x float> %x) { ; ; AVX1-LABEL: round_v16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3 -; AVX1-NEXT: 
vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] ; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vroundps $11, %ymm0, %ymm0 @@ -680,9 +680,9 @@ define <8 x double> @round_v8f64(<8 x double> %x) { ; ; AVX1-LABEL: round_v8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3 -; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] ; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll index 16f3b0a48f48b..364fd81eb1aa9 100644 --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -219,13 +219,15 @@ define <2 x double> @elt1_v2f64(double %x) { ; ; X86-AVX-LABEL: elt1_v2f64: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u> +; X86-AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4.2E+1,4.2E+1] +; X86-AVX-NEXT: # xmm0 = mem[0,0] ; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt1_v2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u> +; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; X64-AVX-NEXT: # xmm1 = mem[0,0] ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64-AVX-NEXT: retq %ins = insertelement <2 x double> , double %x, i32 1 diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll index ebcb45d8b0f43..2eef32eb61414 100644 --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -156,12 +156,12 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind { define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind { ; X86-LABEL: knownbits_mask_or_shuffle_uitofp: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] +; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] ; X86-NEXT: retl ; ; X64-LABEL: knownbits_mask_or_shuffle_uitofp: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] +; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] ; X64-NEXT: retq %1 = and <4 x i32> %a0, %2 = or <4 x i32> %1, @@ -385,7 +385,7 @@ define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) n ; X86-LABEL: knownbits_mask_concat_uitofp: ; X86: # %bb.0: ; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] -; X86-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071] +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071] ; X86-NEXT: vandps %xmm2, %xmm1, %xmm1 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; X86-NEXT: vandps %xmm2, %xmm0, %xmm0 @@ -396,7 +396,7 @@ define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x 
i32> %a1) n ; X64-LABEL: knownbits_mask_concat_uitofp: ; X64: # %bb.0: ; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] -; X64-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071] +; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [131071,131071,131071,131071] ; X64-NEXT: vandps %xmm2, %xmm1, %xmm1 ; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; X64-NEXT: vandps %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index b756165172650..17548df343251 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -376,7 +376,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 @@ -764,7 +764,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 @@ -2221,7 +2221,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v16i32_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 @@ -2897,7 +2897,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; ; AVX1-LABEL: truncstore_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 @@ -4879,7 +4879,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; ; AVX1-LABEL: truncstore_v32i16_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index c15e73daceb9b..682e2002c075a 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -232,7 +232,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm4, 
%xmm1, %xmm6, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm7 @@ -545,7 +546,8 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -1018,7 +1020,8 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -1393,7 +1396,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4294967295,4294967295] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 @@ -1588,7 +1592,8 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 @@ -1869,7 +1874,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm6, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 @@ -2099,7 +2105,8 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -2115,7 +2122,8 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm1[0,2],zero,zero -; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295] +; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -2234,7 +2242,8 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX-LABEL: truncstore_v2i64_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [65535,65535] +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] ; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 @@ -2381,7 +2390,8 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX-LABEL: truncstore_v2i64_v2i8: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [255,255] +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [255,255] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] ; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll index 5a0c703ae2ea4..96ac8dff79530 100644 --- a/llvm/test/CodeGen/X86/memset-nonzero.ll +++ b/llvm/test/CodeGen/X86/memset-nonzero.ll @@ -28,7 +28,7 @@ define void @memset_16_nonzero_bytes(ptr %x) { ; ; AVX-LABEL: memset_16_nonzero_bytes: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX-NEXT: vmovups %xmm0, (%rdi) ; AVX-NEXT: retq %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 16, i64 -1) @@ -54,7 +54,7 @@ define void @memset_32_nonzero_bytes(ptr %x) { ; ; AVX-LABEL: memset_32_nonzero_bytes: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX-NEXT: vmovups %ymm0, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -87,7 +87,7 @@ define void @memset_64_nonzero_bytes(ptr %x) { ; ; AVX1-LABEL: memset_64_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) ; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper @@ -95,7 +95,7 @@ define void @memset_64_nonzero_bytes(ptr %x) { ; ; AVX2-LABEL: memset_64_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX2-NEXT: vmovups %ymm0, 32(%rdi) ; AVX2-NEXT: vmovups %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper @@ -110,7 +110,7 @@ define void @memset_64_nonzero_bytes(ptr %x) { ; ; 
AVX512BW-LABEL: memset_64_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX512BW-NEXT: vmovups %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -156,7 +156,7 @@ define void @memset_128_nonzero_bytes(ptr %x) { ; ; AVX1-LABEL: memset_128_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX1-NEXT: vmovups %ymm0, 96(%rdi) ; AVX1-NEXT: vmovups %ymm0, 64(%rdi) ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) @@ -166,7 +166,7 @@ define void @memset_128_nonzero_bytes(ptr %x) { ; ; AVX2-LABEL: memset_128_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX2-NEXT: vmovups %ymm0, 96(%rdi) ; AVX2-NEXT: vmovups %ymm0, 64(%rdi) ; AVX2-NEXT: vmovups %ymm0, 32(%rdi) @@ -184,7 +184,7 @@ define void @memset_128_nonzero_bytes(ptr %x) { ; ; AVX512BW-LABEL: memset_128_nonzero_bytes: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) ; AVX512BW-NEXT: vmovups %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper @@ -223,7 +223,7 @@ define void @memset_256_nonzero_bytes(ptr %x) { ; ; AVX1-LABEL: memset_256_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX1-NEXT: vmovups %ymm0, 224(%rdi) ; AVX1-NEXT: vmovups %ymm0, 192(%rdi) ; AVX1-NEXT: vmovups %ymm0, 160(%rdi) @@ -237,7 +237,7 @@ define void @memset_256_nonzero_bytes(ptr %x) { ; ; AVX2-LABEL: memset_256_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX2-NEXT: vmovups %ymm0, 224(%rdi) ; AVX2-NEXT: vmovups %ymm0, 192(%rdi) ; AVX2-NEXT: vmovups %ymm0, 160(%rdi) @@ -261,7 +261,7 @@ define void @memset_256_nonzero_bytes(ptr %x) { ; ; AVX512BW-LABEL: memset_256_nonzero_bytes: ; AVX512BW: # %bb.0: -; 
AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] ; AVX512BW-NEXT: vmovups %zmm0, 192(%rdi) ; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi) ; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) diff --git a/llvm/test/CodeGen/X86/merge-store-constants.ll b/llvm/test/CodeGen/X86/merge-store-constants.ll index e7778c0aaf322..8030d5f08fa57 100644 --- a/llvm/test/CodeGen/X86/merge-store-constants.ll +++ b/llvm/test/CodeGen/X86/merge-store-constants.ll @@ -58,14 +58,14 @@ define void @big_nonzero_32_bytes_splat(ptr nocapture %a) { ; X32-LABEL: big_nonzero_32_bytes_splat: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] +; X32-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] ; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: big_nonzero_32_bytes_splat: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] +; X64-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index e17e9d3a6573e..fa36c15b6445a 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -546,7 +546,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, ptr %p) nounwind { ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm2 ; AVX2-FAST-ALL-NEXT: vbroadcastsd %xmm1, %ymm3 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] @@ -1547,7 +1547,8 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7] +; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> @@ -1773,7 +1774,8 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-ALL-NEXT: vmovups (%rsi), %ymm0 ; AVX2-FAST-ALL-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-ALL-NEXT: vmovups (%rcx), %ymm2 -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = <5,u,u,6,u,u,7,u> +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [5,0,7,6,5,0,7,6] +; AVX2-FAST-ALL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm3, %ymm3 ; AVX2-FAST-ALL-NEXT: vbroadcastsd 24(%rsi), %ymm4 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = 
ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll index d4f3d4b9d1401..40d6ec6fb3155 100644 --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -613,7 +613,7 @@ define <64 x i8> @test17(<64 x i8> %x) { ; ; AVX1-LABEL: test17: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3 ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 @@ -1421,7 +1421,7 @@ define <32 x i16> @test35(<32 x i16> %x) { ; ; AVX1-LABEL: test35: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3 ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 diff --git a/llvm/test/CodeGen/X86/pr30290.ll b/llvm/test/CodeGen/X86/pr30290.ll index 74e553191331f..478cb142475da 100644 --- a/llvm/test/CodeGen/X86/pr30290.ll +++ b/llvm/test/CodeGen/X86/pr30290.ll @@ -20,7 +20,7 @@ define void @foo(ptr byval(%struct.face) nocapture align 8) local_unnamed_addr { ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 diff --git a/llvm/test/CodeGen/X86/pr32368.ll b/llvm/test/CodeGen/X86/pr32368.ll index c10bacea688aa..52cf6fb07d672 100644 --- a/llvm/test/CodeGen/X86/pr32368.ll +++ b/llvm/test/CodeGen/X86/pr32368.ll @@ -114,12 +114,12 @@ define <16 x float> @PR32368_512(<16 x float>) { ; ; AVX1-LABEL: PR32368_512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll index c4a085b0b32a8..15cc7581454aa 100644 --- a/llvm/test/CodeGen/X86/pr38639.ll +++ b/llvm/test/CodeGen/X86/pr38639.ll @@ -4,11 +4,12 @@ define <8 x double> @test(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1] ; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1] +; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1] +; CHECK-NEXT: # xmm2 = mem[0,0] ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; CHECK-NEXT: retq %1 = shufflevector <4 x double> %a, <4 x double> , <8 x i32> diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 55e2342e8b0e6..faeaef7b40a62 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -948,7 +948,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 @@ -1746,7 +1746,8 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind { ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -2803,7 +2804,8 @@ define <8 x i32> @test33(<8 x i32> %a0, <8 x i64> %a1) { ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -3027,7 +3029,8 @@ define <8 x i32> @test34(<8 x i32> %a0, <8 x i64> %a1) { ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll index 2279212ac8771..7e9bbc5556424 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -333,53 +333,11 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 { ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-RECIP-LABEL: v4f32_no_estimate: -; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: retq -; -; FMA-RECIP-LABEL: v4f32_no_estimate: -; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; FMA-RECIP-NEXT: retq -; -; BDVER2-LABEL: v4f32_no_estimate: -; BDVER2: # %bb.0: -; BDVER2-NEXT: vmovaps {{.*#+}} xmm1 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: retq -; -; BTVER2-LABEL: v4f32_no_estimate: -; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq -; -; SANDY-LABEL: v4f32_no_estimate: -; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq -; -; HASWELL-LABEL: v4f32_no_estimate: -; HASWELL: # %bb.0: -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; HASWELL-NEXT: retq -; -; HASWELL-NO-FMA-LABEL: v4f32_no_estimate: -; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: retq -; -; AVX512-LABEL: v4f32_no_estimate: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: v4f32_no_estimate: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -400,7 +358,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -422,7 +380,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; ; BTVER2-LABEL: v4f32_one_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -434,7 +392,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -585,7 +543,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -598,7 +556,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; FMA-RECIP-LABEL: v4f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2 ; 
FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1 @@ -609,7 +567,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; BDVER2-LABEL: v4f32_two_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 -; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2 ; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1 ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 @@ -618,7 +576,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; ; BTVER2-LABEL: v4f32_two_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 @@ -634,7 +592,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -694,53 +652,11 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 { ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-RECIP-LABEL: v8f32_no_estimate: -; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: retq -; -; FMA-RECIP-LABEL: v8f32_no_estimate: -; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; FMA-RECIP-NEXT: retq -; -; BDVER2-LABEL: v8f32_no_estimate: -; BDVER2: # %bb.0: -; BDVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; BDVER2-NEXT: retq -; -; BTVER2-LABEL: v8f32_no_estimate: -; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: retq -; -; SANDY-LABEL: v8f32_no_estimate: -; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; SANDY-NEXT: retq -; -; HASWELL-LABEL: v8f32_no_estimate: -; HASWELL: # %bb.0: -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; HASWELL-NEXT: retq -; -; HASWELL-NO-FMA-LABEL: v8f32_no_estimate: -; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: retq -; -; AVX512-LABEL: v8f32_no_estimate: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: v8f32_no_estimate: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vdivps %ymm0, %ymm1, 
%ymm0 +; AVX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -768,7 +684,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -790,7 +706,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; ; BTVER2-LABEL: v8f32_one_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -802,7 +718,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -879,7 +795,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -892,7 +808,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; FMA-RECIP-LABEL: v8f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1 @@ -903,7 +819,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; BDVER2-LABEL: v8f32_two_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 @@ -912,7 +828,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; ; BTVER2-LABEL: v8f32_two_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps 
%ymm1, %ymm0, %ymm2 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 @@ -928,7 +844,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -996,35 +912,35 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { ; ; AVX-RECIP-LABEL: v16f32_no_estimate: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_no_estimate: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_no_estimate: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; BDVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v16f32_no_estimate: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v16f32_no_estimate: ; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: retq @@ -1089,7 +1005,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1103,7 +1019,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { ; FMA-RECIP-LABEL: v16f32_one_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vrcpps 
%ymm1, %ymm2 @@ -1114,7 +1030,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { ; BDVER2-LABEL: v16f32_one_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vrcpps %ymm1, %ymm4 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3 @@ -1124,7 +1040,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { ; ; BTVER2-LABEL: v16f32_one_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vrcpps %ymm1, %ymm4 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -1141,7 +1057,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vrcpps %ymm1, %ymm4 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 @@ -1249,7 +1165,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 @@ -1271,7 +1187,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { ; FMA-RECIP-LABEL: v16f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 @@ -1288,7 +1204,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { ; BDVER2-LABEL: v16f32_two_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 @@ -1302,7 +1218,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { ; ; BTVER2-LABEL: v16f32_two_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps 
%ymm3, %ymm4, %ymm3 @@ -1327,7 +1243,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll index 5bb08e6ce2846..2a5e46bba2c00 100644 --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -476,7 +476,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -504,7 +504,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; ; BTVER2-LABEL: v4f32_one_step_2_divs: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -518,7 +518,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -595,7 +595,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -610,7 +610,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; FMA-RECIP-LABEL: v4f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] @@ -632,7 +632,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 { ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 @@ -650,7 +650,7 @@ define <4 x float> @v4f32_two_step2(<4 x 
float> %x) #2 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -838,7 +838,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -866,7 +866,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; ; BTVER2-LABEL: v8f32_one_step_2_divs: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -880,7 +880,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -972,7 +972,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -987,7 +987,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] @@ -1009,7 +1009,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 { ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 @@ -1027,7 +1027,7 @@ define <8 x float> 
@v8f32_two_step2(<8 x float> %x) #2 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -1327,7 +1327,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1345,7 +1345,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { ; FMA-RECIP-LABEL: v16f32_one_step_2_divs: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 @@ -1360,7 +1360,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { ; BDVER2-LABEL: v16f32_one_step_2_divs: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2 @@ -1374,7 +1374,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { ; ; BTVER2-LABEL: v16f32_one_step_2_divs: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 @@ -1395,7 +1395,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1526,7 +1526,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, 
%ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 @@ -1552,7 +1552,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { ; FMA-RECIP-LABEL: v16f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 @@ -1572,7 +1572,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { ; BDVER2-LABEL: v16f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] @@ -1590,7 +1590,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { ; ; BTVER2-LABEL: v16f32_two_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 @@ -1619,7 +1619,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 { ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index 34eaec95e5ac3..cb89a6595ad3b 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -1063,7 +1063,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 @@ -1197,7 +1197,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX1-LABEL: v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 @@ -1207,7 +1208,8 @@ define <2 x i64> @v2i64(<2 x i64> 
%x, <2 x i64> %y) nounwind { ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: # xmm3 = mem[0,0] ; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 @@ -1217,7 +1219,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512F-LABEL: v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: # xmm3 = mem[0,0] ; AVX512F-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0 @@ -1733,7 +1736,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX1-NEXT: vxorpd %ymm5, %ymm4, %ymm4 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm7, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index b421fa2408039..48a3155cea341 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -656,7 +656,8 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) { ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_min: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573] +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573] +; AVX2-NEXT: # xmm1 = mem[0,0] ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765] ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index f2508cd22f2d4..b042ce13bd627 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -315,7 +315,8 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps (%rdi), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, (%rsi) ; AVX2-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/splat-const.ll b/llvm/test/CodeGen/X86/splat-const.ll index b2b27347f234e..b9fd29658367f 100644 --- a/llvm/test/CodeGen/X86/splat-const.ll +++ b/llvm/test/CodeGen/X86/splat-const.ll @@ -38,7 +38,7 @@ define <4 x i32> @const_vector() { ; ; AVX-LABEL: const_vector: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42] ; AVX-NEXT: retq ; ; AVX2-LABEL: const_vector: diff --git 
a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll index 2a0b9285f3249..6d6a7b897c332 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -64,7 +64,7 @@ define <4 x float> @v4f32_no_daz(<4 x float> %f) #0 { ; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SNB-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0 ; SNB-NEXT: retq @@ -152,7 +152,7 @@ define <8 x float> @v8f32_no_daz(<8 x float> %f) #0 { ; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 ; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 ; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 ; SNB-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll index 54ea207ac5dc6..1c1df175bdb6f 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -210,7 +210,7 @@ define <4 x float> @sqrt_v4f32_check_denorms_ieee_ninf(<4 x float> %x) #3 { ; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -261,7 +261,7 @@ define <4 x float> @sqrt_v4f32_check_denorms_dynamic_ninf(<4 x float> %x) #6 { ; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -360,19 +360,12 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 { ; SSE-NEXT: divps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: v4f32_no_estimate: -; AVX1: # %bb.0: -; AVX1-NEXT: vsqrtps %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX1-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: v4f32_no_estimate: -; AVX512: # %bb.0: -; AVX512-NEXT: vsqrtps %xmm0, %xmm0 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: v4f32_no_estimate: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %xmm0, %xmm0 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: 
vdivps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) %div = fdiv fast <4 x float> , %sqrt ret <4 x float> %div @@ -431,7 +424,7 @@ define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 { ; AVX1-NEXT: vrsqrtps %xmm0, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -461,19 +454,12 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 { ; SSE-NEXT: divps %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: v8f32_no_estimate: -; AVX1: # %bb.0: -; AVX1-NEXT: vsqrtps %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX1-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: v8f32_no_estimate: -; AVX512: # %bb.0: -; AVX512-NEXT: vsqrtps %ymm0, %ymm0 -; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: v8f32_no_estimate: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %ymm0, %ymm0 +; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x) %div = fdiv fast <8 x float> , %sqrt ret <8 x float> %div @@ -544,7 +530,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 { ; AVX1: # %bb.0: ; AVX1-NEXT: vsqrtps %ymm1, %ymm1 ; AVX1-NEXT: vsqrtps %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX1-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq @@ -595,11 +581,11 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 { ; AVX1-LABEL: v16f32_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtps %ymm0, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vrsqrtps %ymm1, %ymm5 ; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0 @@ -985,7 +971,8 @@ define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, ptr %p) nou ; AVX-LABEL: sqrt_simplify_before_recip_vec: ; AVX: # %bb.0: ; AVX-NEXT: vsqrtpd %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vmovupd %xmm1, (%rdi) ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll index 95eb23fc3cd5d..b042f122541b0 100644 --- 
a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -521,20 +521,10 @@ define <4 x i32> @test_srem_one_eq(<4 x i32> %X) nounwind { ; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX1-LABEL: test_srem_one_eq: -; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX1-NEXT: retq -; -; CHECK-AVX2-LABEL: test_srem_one_eq: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512VL-LABEL: test_srem_one_eq: -; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX512VL-NEXT: retq +; CHECK-AVX-LABEL: test_srem_one_eq: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX-NEXT: retq %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, %ret = zext <4 x i1> %cmp to <4 x i32> @@ -677,20 +667,10 @@ define <4 x i32> @test_srem_allones(<4 x i32> %X) nounwind { ; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX1-LABEL: test_srem_allones: -; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX1-NEXT: retq -; -; CHECK-AVX2-LABEL: test_srem_allones: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512VL-LABEL: test_srem_allones: -; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX512VL-NEXT: retq +; CHECK-AVX-LABEL: test_srem_allones: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX-NEXT: retq %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, %ret = zext <4 x i1> %cmp to <4 x i32> diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll index a651648a9d727..231f274db83cd 100644 --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -601,17 +601,11 @@ define fastcc void @test17() nounwind { ; X86-SSE-NEXT: movaps %xmm0, (%eax) ; X86-SSE-NEXT: retl ; -; X86-AVX1-LABEL: test17: -; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = -; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) -; X86-AVX1-NEXT: retl -; -; X86-AVX512-LABEL: test17: -; X86-AVX512: # %bb.0: # %entry -; X86-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] -; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) -; X86-AVX512-NEXT: retl +; X86-AVX-LABEL: test17: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: test17: ; X64-SSE: # %bb.0: # %entry @@ -619,17 +613,11 @@ define fastcc void @test17() nounwind { ; X64-SSE-NEXT: movaps %xmm0, (%rax) ; X64-SSE-NEXT: retq ; -; X64-AVX1-LABEL: test17: -; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = -; X64-AVX1-NEXT: vmovaps %xmm0, (%rax) -; X64-AVX1-NEXT: retq -; -; X64-AVX512-LABEL: test17: -; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] -; X64-AVX512-NEXT: vmovaps %xmm0, (%rax) -; X64-AVX512-NEXT: retq +; X64-AVX-LABEL: test17: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X64-AVX-NEXT: vmovaps %xmm0, (%rax) +; X64-AVX-NEXT: retq entry: %0 = insertelement <4 x i32> undef, i32 undef, i32 1 %1 = shufflevector <4 x i32> , <4 x i32> %0, <4 x i32> @@ -712,3 
+700,8 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) { %m = mul <4 x i32> %x, %y ret <4 x i32> %m } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; X64-AVX1: {{.*}} +; X64-AVX512: {{.*}} +; X86-AVX1: {{.*}} +; X86-AVX512: {{.*}} diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index bd9ee00d32e70..72a3e74ff0a7f 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -48,8 +48,10 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; ; X64-AVX2-LABEL: vec_v2i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807] +; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: # xmm2 = mem[0,0] +; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807] +; X64-AVX2-NEXT: # xmm3 = mem[0,0] ; X64-AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm3 ; X64-AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 ; X64-AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm4 diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index c8fd7e89c605f..21f1fd6c8da21 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -1126,7 +1126,7 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 @@ -1292,7 +1292,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq @@ -1304,7 +1305,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq @@ -1316,7 +1318,8 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: # xmm2 = mem[0,0] ; 
AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX512F-NEXT: retq @@ -1959,7 +1962,7 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind { ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll index 8f770f98bc5ce..0b9a413d00b1d 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -443,20 +443,10 @@ define <4 x i32> @test_urem_one_eq(<4 x i32> %X) nounwind { ; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX1-LABEL: test_urem_one_eq: -; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX1-NEXT: retq -; -; CHECK-AVX2-LABEL: test_urem_one_eq: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512VL-LABEL: test_urem_one_eq: -; CHECK-AVX512VL: # %bb.0: -; CHECK-AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; CHECK-AVX512VL-NEXT: retq +; CHECK-AVX-LABEL: test_urem_one_eq: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX-NEXT: retq %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, %ret = zext <4 x i1> %cmp to <4 x i32> diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll index 212d9764622de..c053acd17a1fd 100644 --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -240,7 +240,7 @@ define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) { define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 { ; X86-LABEL: two_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -248,7 +248,7 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 { ; ; X64-LABEL: two_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -298,7 +298,7 @@ entry: define <8 x i32> @three_ands(<8 x float> %x) { ; X86-LABEL: three_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, 
%ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -309,7 +309,7 @@ define <8 x i32> @three_ands(<8 x float> %x) { ; ; X64-LABEL: three_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -374,7 +374,7 @@ entry: define <8 x i32> @four_ands(<8 x float> %x) { ; X86-LABEL: four_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -387,7 +387,7 @@ define <8 x i32> @four_ands(<8 x float> %x) { ; ; X64-LABEL: four_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -464,7 +464,7 @@ entry: define <8 x i32> @five_ands(<8 x float> %x) { ; X86-LABEL: five_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -479,7 +479,7 @@ define <8 x i32> @five_ands(<8 x float> %x) { ; ; X64-LABEL: five_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -568,7 +568,7 @@ entry: define <8 x i32> @two_or(<8 x float> %x) { ; X86-LABEL: two_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -576,7 +576,7 @@ define <8 x i32> @two_or(<8 x float> %x) { ; ; X64-LABEL: two_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -628,7 +628,7 @@ entry: define <8 x i32> @three_or(<8 x float> %x) { ; X86-LABEL: three_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = 
[-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -639,7 +639,7 @@ define <8 x i32> @three_or(<8 x float> %x) { ; ; X64-LABEL: three_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -708,7 +708,7 @@ entry: define <8 x i32> @four_or(<8 x float> %x) { ; X86-LABEL: four_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -721,7 +721,7 @@ define <8 x i32> @four_or(<8 x float> %x) { ; ; X64-LABEL: four_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -804,7 +804,7 @@ entry: define <8 x i32> @five_or(<8 x float> %x) { ; X86-LABEL: five_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -819,7 +819,7 @@ define <8 x i32> @five_or(<8 x float> %x) { ; ; X64-LABEL: five_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -916,7 +916,7 @@ entry: define <8 x i32> @three_or_and(<8 x float> %x) { ; X86-LABEL: three_or_and: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -927,7 +927,7 @@ define <8 x i32> @three_or_and(<8 x float> %x) { ; ; X64-LABEL: three_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -994,7 +994,7 @@ entry: define <8 x i32> @four_or_and(<8 x float> %x) { ; X86-LABEL: four_or_and: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 @@ -1007,7 +1007,7 @@ define <8 x i32> @four_or_and(<8 x float> %x) { ; ; X64-LABEL: four_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 @@ -1086,7 +1086,7 @@ entry: define <8 x i32> @five_or_and(<8 x float> %x) { ; X86-LABEL: five_or_and: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1101,7 +1101,7 @@ define <8 x i32> @five_or_and(<8 x float> %x) { ; ; X64-LABEL: five_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1194,7 +1194,7 @@ entry: define <8 x i32> @four_or_and_xor(<8 x float> %x) { ; X86-LABEL: four_or_and_xor: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %ymm2, %ymm1, %ymm1 @@ -1207,7 +1207,7 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) { ; ; X64-LABEL: four_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1 @@ -1288,7 +1288,7 @@ entry: define <8 x i32> @five_or_and_xor(<8 x float> %x) { ; X86-LABEL: five_or_and_xor: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1303,7 +1303,7 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) { ; ; 
X64-LABEL: five_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1397,7 +1397,7 @@ entry: define <8 x i32> @six_or_and_xor(<8 x float> %x) { ; X86-LABEL: six_or_and_xor: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1414,7 +1414,7 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) { ; ; X64-LABEL: six_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll index a11f13e606c30..349d94d930651 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -3063,10 +3063,10 @@ define <4 x i32> @strict_vector_fptoui_v4f32_to_v4i32(<4 x float> %a) #0 { ; ; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll index d072072414f42..b28211bb4388f 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -1113,12 +1113,12 @@ define <4 x i32> @strict_vector_fptosi_v4f64_to_v4i32(<4 x double> %a) #0 { define <4 x i32> @strict_vector_fptoui_v4f64_to_v4i32(<4 x double> %a) #0 { ; AVX-LABEL: strict_vector_fptoui_v4f64_to_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] ; AVX-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2] ; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] ; AVX-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3 ; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX-NEXT: 
vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 @@ -1379,10 +1379,10 @@ define <8 x i32> @strict_vector_fptosi_v8f32_to_v8i32(<8 x float> %a) #0 { define <8 x i32> @strict_vector_fptoui_v8f32_to_v8i32(<8 x float> %a) #0 { ; AVX-LABEL: strict_vector_fptoui_v8f32_to_v8i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] ; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm4 ; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vsubps %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll index d5a08299a00a6..edba0caabc15f 100644 --- a/llvm/test/CodeGen/X86/vec_anyext.ll +++ b/llvm/test/CodeGen/X86/vec_anyext.ll @@ -211,7 +211,8 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind { define <4 x i16> @const_16_32() nounwind { ; CHECK-LABEL: const_16_32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,3,8,7,u,u,u,u> +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7] +; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: ret{{[l|q]}} %G = trunc <4 x i32> to <4 x i16> ret <4 x i16> %G @@ -220,7 +221,8 @@ define <4 x i16> @const_16_32() nounwind { define <4 x i16> @const_16_64() nounwind { ; CHECK-LABEL: const_16_64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,3,8,7,u,u,u,u> +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7] +; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: ret{{[l|q]}} %G = trunc <4 x i64> to <4 x i16> ret <4 x i16> %G diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll index fb01a18ea9280..982062d890754 100644 --- a/llvm/test/CodeGen/X86/vec_fabs.ll +++ b/llvm/test/CodeGen/X86/vec_fabs.ll @@ -149,7 +149,7 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p) define <8 x double> @fabs_v8f64(<8 x double> %p) { ; X86-AVX-LABEL: fabs_v8f64: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X86-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] ; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X86-AVX-NEXT: retl @@ -166,7 +166,7 @@ define <8 x double> @fabs_v8f64(<8 x double> %p) { ; ; X64-AVX-LABEL: fabs_v8f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] ; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64-AVX-NEXT: retq @@ -188,7 +188,7 @@ declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p) define <16 x float> @fabs_v16f32(<16 x float> %p) { ; X86-AVX-LABEL: fabs_v16f32: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X86-AVX-NEXT: retl @@ -205,7 +205,7 @@ define <16 x float> @fabs_v16f32(<16 x float> %p) { ; ; X64-AVX-LABEL: fabs_v16f32: ; X64-AVX: 
# %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64-AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index 04609f02d333e..4f7a4676390f8 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -1912,7 +1912,8 @@ define <4 x i32> @fptosi_2f64_to_2i32_const() { ; ; AVX-LABEL: fptosi_2f64_to_2i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,1,4294967295,1] +; AVX-NEXT: # xmm0 = mem[0,0] ; AVX-NEXT: retq %cvt = fptosi <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> @@ -1970,7 +1971,8 @@ define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { ; ; AVX-LABEL: fptoui_2f64_to_2i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [2,4,2,4] +; AVX-NEXT: # xmm0 = mem[0,0] ; AVX-NEXT: retq %cvt = fptoui <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 54133eef2ef20..24e05bd937b0c 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -4786,7 +4786,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovapd (%rdi), %ymm2 ; AVX1-NEXT: vmovapd 32(%rdi), %ymm3 -; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,1,1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1] ; AVX1-NEXT: vandpd %ymm4, %ymm3, %ymm5 ; AVX1-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 @@ -5640,7 +5640,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] @@ -5649,7 +5650,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX1-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vsubpd %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vaddpd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovupd %xmm0, (%rdi) @@ -5666,7 +5668,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: # xmm6 = mem[0,0] ; AVX2-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] @@ -5675,7 +5678,8 @@ 
define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX2-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX2-NEXT: vsubpd %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: vaddpd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovupd %xmm0, (%rdi) @@ -5692,7 +5696,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: # xmm6 = mem[0,0] ; AVX512F-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX512F-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] @@ -5701,7 +5706,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX512F-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512F-NEXT: vsubpd %xmm6, %xmm1, %xmm1 ; AVX512F-NEXT: vaddpd %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512F-NEXT: # xmm2 = mem[0,0] ; AVX512F-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vmovupd %xmm0, (%rdi) @@ -5742,7 +5748,8 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX512DQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512DQ-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512DQ-NEXT: # xmm2 = mem[0,0] ; AVX512DQ-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovupd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index be58e3958dea7..9a43d312f1322 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -431,7 +431,8 @@ define <2 x double> @constrained_vector_fmul_v2f64() #0 { ; ; AVX-LABEL: constrained_vector_fmul_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: # xmm0 = mem[0,0] ; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -493,7 +494,8 @@ define <3 x double> @constrained_vector_fmul_v3f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq @@ -516,17 +518,11 @@ define <4 x double> @constrained_vector_fmul_v4f64() #0 { ; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; -; AVX1-LABEL: constrained_vector_fmul_v4f64: -; AVX1: # 
%bb.0: # %entry -; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: constrained_vector_fmul_v4f64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX512-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: constrained_vector_fmul_v4f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq entry: %mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64( <4 x double> @constrained_vector_fadd_v2f64() #0 { ; ; AVX-LABEL: constrained_vector_fadd_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: # xmm0 = mem[0,0] ; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -631,7 +628,8 @@ define <3 x double> @constrained_vector_fadd_v3f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq @@ -654,17 +652,11 @@ define <4 x double> @constrained_vector_fadd_v4f64() #0 { ; CHECK-NEXT: addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; -; AVX1-LABEL: constrained_vector_fadd_v4f64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: constrained_vector_fadd_v4f64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: constrained_vector_fadd_v4f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq entry: %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64( <4 x double> @constrained_vector_fsub_v2f64() #0 { ; ; AVX-LABEL: constrained_vector_fsub_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] +; AVX-NEXT: # xmm0 = mem[0,0] ; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -772,7 +765,8 @@ define <3 x double> @constrained_vector_fsub_v3f64() #0 { ; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 ; 
AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq @@ -795,17 +789,11 @@ define <4 x double> @constrained_vector_fsub_v4f64() #0 { ; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; -; AVX1-LABEL: constrained_vector_fsub_v4f64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308] -; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: constrained_vector_fsub_v4f64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308] -; AVX512-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: constrained_vector_fsub_v4f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308] +; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq entry: %sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64( <4 x double> @constrained_vector_fptoui_v4i32_v4f32() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] ; AVX1-NEXT: vcmpltps %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vsubps %xmm0, %xmm1, %xmm0 @@ -5010,13 +4998,13 @@ define <4 x i32> @constrained_vector_fptoui_v4i32_v4f64() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v4i32_v4f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] ; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] ; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2] ; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] ; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3 ; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index adae44774b182..0500d6ec6e1f9 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ 
b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -22,7 +22,7 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63] ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 @@ -123,7 +123,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 9b230ccefd3c8..4a580c8bacabe 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -22,7 +22,7 @@ declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63] ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 @@ -124,7 +124,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [63,63,63,63] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 3e65c31cf83a1..a6e64e1d8f6d0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -328,7 +328,8 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 8c7a91013144e..8b1aae61ed5c3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -177,7 +177,8 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf4: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,4,0,4] +; AVX2-ONLY-NEXT: 
# xmm0 = mem[0,0] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 @@ -187,16 +188,19 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <3,7,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi) @@ -336,7 +340,8 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm5 ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm5, %ymm6 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 @@ -350,7 +355,8 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = <1,5,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm11, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] @@ -358,7 +364,8 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm11 ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] @@ -368,7 +375,8 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = <3,7,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = 
[3,7,3,7] +; AVX2-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm5, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -673,7 +681,8 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm10 ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm9, %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] @@ -695,7 +704,8 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] @@ -716,7 +726,8 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm10 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] @@ -732,7 +743,8 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <3,7,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] @@ -1379,7 +1391,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -1440,7 +1453,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <1,5,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1494,7 +1508,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -1532,7 +1547,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm10, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm2 = <3,7,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2855,7 +2871,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2999,7 +3016,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3118,7 +3136,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm2 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] @@ -3206,7 +3225,8 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm1 = <3,7,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm1 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 872fd8698cca1..d28215f89fdc1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -86,10 +86,12 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <4,2,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2] +; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] +; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-ONLY-NEXT: vmovlps %xmm4, (%rsi) ; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rdx) @@ -114,11 +116,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = <4,2,u,u> +; AVX512F-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2] +; AVX512F-SLOW-NEXT: # xmm2 = mem[0,0] ; AVX512F-SLOW-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512F-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512F-SLOW-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u> +; AVX512F-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] +; AVX512F-SLOW-NEXT: # xmm6 = mem[0,0] ; AVX512F-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi) ; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx) @@ -143,11 +147,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u> ; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512F-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> +; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] +; AVX512F-FAST-NEXT: # xmm1 = mem[0,0] ; AVX512F-FAST-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512F-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u> +; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] +; 
AVX512F-FAST-NEXT: # xmm6 = mem[0,0] ; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi) ; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx) @@ -172,11 +178,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = <4,2,u,u> +; AVX512BW-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2] +; AVX512BW-SLOW-NEXT: # xmm2 = mem[0,0] ; AVX512BW-SLOW-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512BW-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u> +; AVX512BW-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] +; AVX512BW-SLOW-NEXT: # xmm6 = mem[0,0] ; AVX512BW-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rsi) ; AVX512BW-SLOW-NEXT: vmovq %xmm1, (%rdx) @@ -201,11 +209,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u> ; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> +; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] +; AVX512BW-FAST-NEXT: # xmm1 = mem[0,0] ; AVX512BW-FAST-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512BW-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512BW-FAST-NEXT: vmovaps {{.*#+}} xmm6 = <5,3,u,u> +; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] +; AVX512BW-FAST-NEXT: # xmm6 = mem[0,0] ; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-FAST-NEXT: vmovq %xmm4, (%rdx) @@ -672,7 +682,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm11 = mem[0,0] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] @@ -683,7 +694,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm5 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] @@ -727,7 +739,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; 
AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm9 = [2,4,2,4,2,4,2,4] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] @@ -757,7 +769,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm11 = mem[0,0] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] @@ -768,7 +781,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm5 = mem[0,0] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] @@ -842,7 +856,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm11 = [4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: # xmm11 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] @@ -853,7 +868,8 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] @@ -1529,7 +1545,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm5 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] @@ -1550,7 +1567,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm4 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] @@ -1704,7 +1722,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm5 = mem[0,0] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] @@ -1725,7 +1744,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm4 = mem[0,0] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] @@ -1880,7 +1900,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] @@ -1901,7 +1922,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm4 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] @@ -3455,7 +3477,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] @@ -3516,7 +3539,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm14 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] @@ -3863,7 +3887,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] @@ -3921,7 +3946,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm14 = mem[0,0] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] @@ -4270,7 +4296,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] @@ -4331,7 +4358,8 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm14 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm14 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] @@ -7520,7 +7548,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2] +; AVX2-SLOW-NEXT: # xmm8 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7660,7 +7689,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] @@ -8382,7 +8412,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2] +; AVX2-FAST-NEXT: # xmm8 = mem[0,0] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8524,7 +8555,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] +; AVX2-FAST-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] @@ -9251,7 +9283,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm8 = [4,2,4,2] +; 
AVX2-FAST-PERLANE-NEXT: # xmm8 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -9391,7 +9424,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm1 = [5,3,5,3] +; AVX2-FAST-PERLANE-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 011485f16168e..f9713d1eab16c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -100,7 +100,8 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm4 = <4,3,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3] +; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] @@ -421,7 +422,8 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm10 = <4,3,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] +; AVX2-SLOW-NEXT: # xmm10 = mem[0,0] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -457,7 +459,8 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vbroadcastss 84(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <1,0,7,u,u,u,u,u> +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,7,0,1,0,7,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm5 @@ -478,7 +481,8 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <4,3,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = 
[4,3,4,3] +; AVX2-FAST-NEXT: # xmm10 = mem[0,0] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -535,7 +539,8 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm10 = <4,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -4298,7 +4303,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <4,3,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm3 = [4,3,4,3] +; AVX2-SLOW-NEXT: # xmm3 = mem[0,0] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] @@ -5321,7 +5327,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <4,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm3 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] @@ -9202,7 +9209,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u> +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3] +; AVX2-SLOW-NEXT: # xmm5 = mem[0,0] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] @@ -10244,7 +10252,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u> +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3] +; AVX2-FAST-NEXT: # xmm5 = mem[0,0] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] @@ -11293,7 +11302,8 @@ define void @load_i32_stride7_vf64(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm5 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: # xmm5 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index fede3ba1ca14b..a4482bafbd535 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -136,7 +136,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7,3,7,3,7,3,7,3] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3] @@ -306,7 +306,8 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <5,u,u,6,u,u,7,u> +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [5,0,7,6,5,0,7,6] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd 24(%rdi), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 0a28126d1b3ac..17bd3eb320104 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -244,7 +244,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm5 = [7,3,7,3,7,3,7,3] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 083e6e7f4b1de..5d478ae0f3e25 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -265,7 +265,8 @@ define void @store_i32_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: 
vinsertf128 $1, %xmm3, %ymm2, %ymm7 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,4,1,5,0,4,1,5] ; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index 6bbba6fc39143..ab3122960f53c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -139,7 +139,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,u,1> ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,4,0,0,2,4,0] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm2 @@ -439,7 +440,8 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm9 = +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4] +; AVX2-SLOW-NEXT: # xmm9 = mem[0,0] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm6 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1] ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] @@ -476,14 +478,14 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [5,1,5,1,5,1,5,1] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [5,0,2,6,5,0,2,6] ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [7,3,7,3,7,3,7,3] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [2,6,0,3,2,6,0,3] ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] @@ -493,7 +495,8 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm5 = [0,4,0,4] +; AVX2-FAST-NEXT: # xmm5 = mem[0,0] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,4,0,1,0,4,0,1] ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] @@ -504,7 
+507,7 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vbroadcastss (%r10), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3] @@ -548,7 +551,8 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4] +; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,0] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1] ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index c20180523661e..69d8fa57cd482 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -263,7 +263,8 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm6, %ymm9 ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,4,0,4] +; AVX2-ONLY-NEXT: # xmm9 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -272,7 +273,8 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm10 ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm10 = <1,5,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm10 = [1,5,1,5] +; AVX2-ONLY-NEXT: # xmm10 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] @@ -281,7 +283,8 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm11 ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [2,6,2,6] +; AVX2-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm11, %ymm8 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] @@ -290,7 +293,8 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm7 ; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = <3,7,u,u> +; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [3,7,3,7] +; AVX2-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index c57f4d9cb59b2..320b63ee20bd5 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -255,7 +255,7 @@ define i64 @test_v16i64_v16i8(<16 x i64> %a0) { ; ; AVX1-LABEL: test_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,1,1,1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1,1,1,1] ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -577,7 +577,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) { ; ; AVX1-SLOW-LABEL: test_v16i32_v16i8: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -595,7 +595,7 @@ define i32 @test_v16i32_v16i8(<16 x i32> %a0) { ; ; AVX1-FAST-LABEL: test_v16i32_v16i8: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-FAST-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -694,7 +694,7 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) { ; ; AVX1-SLOW-LABEL: test_v32i32_v32i8: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -720,7 +720,7 @@ define i32 @test_v32i32_v32i8(<32 x i32> %a0) { ; ; AVX1-FAST-LABEL: test_v32i32_v32i8: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -1222,7 +1222,7 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) { ; ; AVX1-LABEL: test_v64i16_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index e4cc9731c6105..5f8c06625a93c 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -633,7 +633,7 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) { ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index a3dd5bf3a1d42..7e7ba8b9ae65b 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2211,7 +2211,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -2247,7 +2247,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 8a302e026b6b4..30b743cb7bdfd 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -903,7 +903,8 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) { ; ; AVX2-FAST-ALL-LABEL: shuffle_v8f32_c348cda0: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,4,0,0,3,4,0] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] ; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] @@ -953,7 +954,7 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) { ; ; AVX2-FAST-ALL-LABEL: shuffle_v8f32_f511235a: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2> +; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7,2,7,2,7,2,7,2] ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll index 66d2ccff6d77f..90b5e70a0a302 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ 
b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -300,7 +300,8 @@ define <8 x float> @expand15(<4 x float> %a) { ; AVX512-FAST-LABEL: expand15: ; AVX512-FAST: # %bb.0: ; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512-FAST-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX512-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [1,0,0,0,1,0,0,0] +; AVX512-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; AVX512-FAST-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 0ad7b898e07e8..b34af730565e4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -501,7 +501,8 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,3,0,10,0,1,0] ; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4 ; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx) -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,0,11,0,u,u,u,u> +; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,0,11,0,3,0,11,0] +; X86-AVX512-NEXT: # ymm3 = mem[0,1,0,1] ; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 ; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,0,8,0,9,0,3,0] ; X86-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0 @@ -568,7 +569,8 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1] ; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3 ; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi) -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,11,u,u> +; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,11,3,11] +; X64-AVX512-NEXT: # ymm3 = mem[0,1,0,1] ; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,8,9,3] ; X64-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index c87869e6c71f6..1b9648e77162e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2842,7 +2842,8 @@ define <4 x float> @PR30264(<4 x float> %x) { ; ; AVX-LABEL: PR30264: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.0E+0,1.0E+0,4.0E+0,1.0E+0] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x float> %x, <4 x float> , <4 x i32> @@ -3471,37 +3472,21 @@ define void @SpinningCube() { ; SSE41-NEXT: movaps %xmm2, (%rax) ; SSE41-NEXT: retq ; -; AVX1-LABEL: SpinningCube: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX1-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovaps %xmm2, (%rax) -; AVX1-NEXT: vbroadcastss (%rax), %xmm2 -; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovaps %xmm0, (%rax) -; AVX1-NEXT: retq -; -; AVX2-LABEL: SpinningCube: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; AVX2-NEXT: vbroadcastss 
{{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX2-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovaps %xmm2, (%rax) -; AVX2-NEXT: vbroadcastss (%rax), %xmm2 -; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovaps %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX-LABEL: SpinningCube: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> +; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, (%rax) +; AVX-NEXT: vbroadcastss (%rax), %xmm2 +; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rax) +; AVX-NEXT: retq entry: store float 1.000000e+00, ptr undef, align 4 %0 = load float, ptr undef, align 4 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 39dc8662c7a4e..37b996bfe686a 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -528,7 +528,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_add_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -628,7 +628,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -696,7 +696,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1276,7 +1276,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1376,7 +1376,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: 
vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -1444,7 +1444,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1686,7 +1686,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX1-LABEL: trunc_mul_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -2202,7 +2202,7 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -2567,7 +2567,8 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -2617,7 +2618,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -2730,7 +2731,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -2806,7 +2807,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -2928,7 +2929,8 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2-FAST-ALL-NEXT: vbroadcastf128 
{{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -2973,7 +2975,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_and_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3073,7 +3075,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_and_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -3141,7 +3143,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX1-LABEL: trunc_and_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3264,7 +3266,8 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -3314,7 +3317,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3427,7 +3430,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 ; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -3503,7 +3506,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3625,7 +3628,8 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; 
AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -3670,7 +3674,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3770,7 +3774,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -3838,7 +3842,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -3961,7 +3965,8 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -4011,7 +4016,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4124,7 +4129,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 ; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -4200,7 +4205,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, 
%ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4322,7 +4327,8 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -4367,7 +4373,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_or_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -4467,7 +4473,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX1-LABEL: trunc_or_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -4535,7 +4541,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; ; AVX1-LABEL: trunc_or_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index 46ae1bdd8e654..f6e4377f64fa7 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -514,7 +514,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index 51b013c63b70c..f687374baea4b 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -69,7 +69,8 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) { ; ; AVX-LABEL: trunc_usat_v2i64_v2i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] ; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -166,7 +167,8 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; ; AVX-LABEL: trunc_usat_v2i64_v2i32_store: ; 
AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] ; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -307,7 +309,8 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,429496729] ; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [4294967295,4294967295] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX1-NEXT: vzeroupper @@ -334,7 +337,8 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -582,7 +586,8 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) { ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4294967295,4294967295] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -728,7 +733,8 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX1-LABEL: trunc_usat_v2i64_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -739,7 +745,8 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] ; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -750,7 +757,8 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -848,7 +856,8 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) { ; ; AVX1-LABEL: 
trunc_usat_v2i64_v2i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -860,7 +869,8 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) { ; ; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] ; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -872,7 +882,8 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) { ; ; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1021,7 +1032,8 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) { ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 @@ -1187,7 +1199,8 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [65535,65535] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 @@ -1447,7 +1460,8 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) { ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -1993,7 +2007,8 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) { ; ; AVX-LABEL: trunc_usat_v2i64_v2i8: ; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [255,255] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] ; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -2093,7 +2108,8 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) { ; ; AVX-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = 
[255,255] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] ; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -2241,7 +2257,8 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -2411,7 +2428,8 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -2663,7 +2681,8 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -2904,7 +2923,8 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -3295,7 +3315,8 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) { ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index 87cc6a86d7dd0..b5fa7312f7121 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -201,7 +201,7 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { ; ; AVX1-LABEL: trunc8i64_8i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -279,7 +279,7 @@ define void @trunc8i64_8i8(<8 x i64> %a) { ; ; AVX1-LABEL: trunc8i64_8i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255] +; 
AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -671,7 +671,7 @@ define void @trunc16i32_16i16(<16 x i32> %a) { ; ; AVX1-LABEL: trunc16i32_16i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 @@ -897,7 +897,7 @@ define void @trunc16i32_16i8(<16 x i32> %a) { ; ; AVX1-LABEL: trunc16i32_16i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1295,7 +1295,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; ; AVX1-LABEL: trunc32i16_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 @@ -1733,7 +1733,7 @@ define <32 x i8> @trunc2x16i16_32i8(<16 x i16> %a, <16 x i16> %b) { ; ; AVX1-LABEL: trunc2x16i16_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 @@ -2146,7 +2146,7 @@ define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, ptr %p) a ; ; AVX1-LABEL: store_merge_split: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index cf25020877255..367e0993e76ba 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -47,7 +47,7 @@ define void @test2(ptr %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq (%rdi,%rsi,8), %rax -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] ; AVX1-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX1-NEXT: vmovupd %ymm0, (%rax) ; AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll index cc37f2c0c5484..1b576b28ce831 100644 --- a/llvm/test/CodeGen/X86/vselect-zero.ll +++ b/llvm/test/CodeGen/X86/vselect-zero.ll @@ -125,7 +125,8 @@ define double @fsel_nonzero_false_val(double %x, double %y, double %z) { ; AVX-LABEL: fsel_nonzero_false_val: ; AVX: # %bb.0: ; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 
-; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -179,7 +180,8 @@ define double @fsel_nonzero_constants(double %x, double %y) { ; AVX-LABEL: fsel_nonzero_constants: ; AVX: # %bb.0: ; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/win_cst_pool.ll b/llvm/test/CodeGen/X86/win_cst_pool.ll index 5bc9d34e4b35d..1fc05b26fddb5 100644 --- a/llvm/test/CodeGen/X86/win_cst_pool.ll +++ b/llvm/test/CodeGen/X86/win_cst_pool.ll @@ -65,16 +65,14 @@ define <8 x i16> @vec2() { define <4 x float> @undef1() { ret <4 x float> -; CHECK: .globl __xmm@00000000000000003f8000003f800000 -; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000000000000003f8000003f800000 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: __xmm@00000000000000003f8000003f800000: -; CHECK-NEXT: .long 0x3f800000 # float 1 +; CHECK: .globl __real@3f800000 +; CHECK-NEXT: .section .rdata,"dr",discard,__real@3f800000 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: __real@3f800000: ; CHECK-NEXT: .long 0x3f800000 # float 1 -; CHECK-NEXT: .zero 4 -; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .text ; CHECK: undef1: -; CHECK: movaps __xmm@00000000000000003f8000003f800000(%rip), %xmm0 +; CHECK: vbroadcastss __real@3f800000(%rip), %xmm0 ; CHECK-NEXT: ret } From 8a56a730f26c52eae38b7614edcc6c37ea033f48 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Mon, 29 May 2023 08:52:40 +0530 Subject: [PATCH 007/704] [MLIR] Add output argument to affineParallelize utility Add output argument to affineParallelize utility. NFC. Differential Revision: https://reviews.llvm.org/D151636 --- mlir/include/mlir/Dialect/Affine/Utils.h | 9 +++++---- mlir/lib/Dialect/Affine/Utils/Utils.cpp | 5 ++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h index 8e54a02a89105..ca52f1771737f 100644 --- a/mlir/include/mlir/Dialect/Affine/Utils.h +++ b/mlir/include/mlir/Dialect/Affine/Utils.h @@ -44,10 +44,11 @@ using ReductionLoopMap = DenseMap>; /// (mlir::isLoopParallel can be used to detect a parallel affine.for op.) The /// reductions specified in `parallelReductions` are also parallelized. /// Parallelization will fail in the presence of loop iteration arguments that -/// are not listed in `parallelReductions`. -LogicalResult -affineParallelize(AffineForOp forOp, - ArrayRef parallelReductions = {}); +/// are not listed in `parallelReductions`. `resOp` if non-null is set to the +/// newly created affine.parallel op. +LogicalResult affineParallelize(AffineForOp forOp, + ArrayRef parallelReductions = {}, + AffineParallelOp *resOp = nullptr); /// Hoists out affine.if/else to as high as possible, i.e., past all invariant /// affine.fors/parallel's. 
Returns success if any hoisting happened; folded` is diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index 4e02b612b9bfe..d567093188e0c 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -344,7 +344,8 @@ static AffineIfOp hoistAffineIfOp(AffineIfOp ifOp, Operation *hoistOverOp) { LogicalResult mlir::affine::affineParallelize(AffineForOp forOp, - ArrayRef parallelReductions) { + ArrayRef parallelReductions, + AffineParallelOp *resOp) { // Fail early if there are iter arguments that are not reductions. unsigned numReductions = parallelReductions.size(); if (numReductions != forOp.getNumIterOperands()) @@ -398,6 +399,8 @@ mlir::affine::affineParallelize(AffineForOp forOp, newPloop.getBody()->eraseArguments(numIVs, numReductions); forOp.erase(); + if (resOp) + *resOp = newPloop; return success(); } From 420cf6927c35449f234549389e6ce18371cdda24 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Sun, 28 May 2023 22:17:20 -0700 Subject: [PATCH 008/704] [LSV] Return same bitwidth from getConstantOffset. Previously, getConstantOffset could return an APInt with a different bitwidth than the input pointers. For example, we might be loading an opaque 64-bit pointer, but stripAndAccumulateInBoundsConstantOffsets might give a 32-bit offset. This was OK in most cases because in gatherChains, we casted the APInt back to the original ASPtrBits. But it was not OK when considering selects. We'd call getConstantOffset twice and compare the resulting APInt's, which might not have the same bit width. This fixes that. Now getConstantOffset always returns offsets with the correct width, so we don't need the hack of casting it in gatherChains, and it works correctly when we're handling selects. Differential Revision: https://reviews.llvm.org/D151640 --- .../Vectorize/LoadStoreVectorizer.cpp | 19 +++++++++++-------- .../AMDGPU/vect-ptr-ptr-size-mismatch.ll | 17 +++++++++++++++++ 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index d4a1815719065..043892c799074 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -1441,8 +1441,7 @@ std::vector Vectorizer::gatherChains(ArrayRef Instrs) { if (Offset.has_value()) { // `Offset` might not have the expected number of bits, if e.g. AS has a // different number of bits than opaque pointers. - ChainIter->second.push_back( - ChainElem{I, Offset.value().sextOrTrunc(ASPtrBits)}); + ChainIter->second.push_back(ChainElem{I, Offset.value()}); // Move ChainIter to the front of the MRU list. MRU.remove(*ChainIter); MRU.push_front(*ChainIter); @@ -1475,9 +1474,11 @@ std::optional Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst << ", Depth=" << Depth << "\n"); - unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType()); - APInt OffsetA(OffsetBitWidth, 0); - APInt OffsetB(OffsetBitWidth, 0); + // We'll ultimately return a value of this bit width, even if computations + // happen in a different width. 
+ unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType()); + APInt OffsetA(OrigBitWidth, 0); + APInt OffsetB(OrigBitWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); @@ -1493,7 +1494,7 @@ std::optional Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); if (PtrA == PtrB) - return OffsetB - OffsetA; + return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth); // Try to compute B - A. const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA)); @@ -1501,11 +1502,13 @@ std::optional Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB, LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n"); ConstantRange DistRange = SE.getSignedRange(DistScev); if (DistRange.isSingleElement()) - return OffsetB - OffsetA + *DistRange.getSingleElement(); + return (OffsetB - OffsetA + *DistRange.getSingleElement()) + .sextOrTrunc(OrigBitWidth); } std::optional Diff = getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth); if (Diff.has_value()) - return OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()); + return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth())) + .sextOrTrunc(OrigBitWidth); return std::nullopt; } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll index f3575e5edd764..aec5bca3b6fd2 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/vect-ptr-ptr-size-mismatch.ll @@ -62,6 +62,23 @@ entry: unreachable } +; CHECK-LABEL: @select_different_as +; CHECK: load <2 x i32> +define void @select_different_as(ptr addrspace(1) %p0, ptr addrspace(5) %q0, i1 %cond) { +entry: + %p1 = getelementptr inbounds i32, ptr addrspace(1) %p0, i64 1 + %q1 = getelementptr inbounds i32, ptr addrspace(5) %q0, i64 1 + %p0.ascast = addrspacecast ptr addrspace(1) %p0 to ptr + %p1.ascast = addrspacecast ptr addrspace(1) %p1 to ptr + %q0.ascast = addrspacecast ptr addrspace(5) %q0 to ptr + %q1.ascast = addrspacecast ptr addrspace(5) %q1 to ptr + %sel0 = select i1 %cond, ptr %p0.ascast, ptr %q0.ascast + %sel1 = select i1 %cond, ptr %p1.ascast, ptr %q1.ascast + %tmp1 = load i32, ptr %sel0, align 8 + %tmp2 = load i32, ptr %sel1, align 8 + unreachable +} + ; CHECK-LABEL: @shrink_ptr ; CHECK: load <2 x i32> define void @shrink_ptr(ptr %p) { From a3a8cbffcf00bdbf19a15a070313b60c0a9ce342 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Mon, 29 May 2023 16:37:13 +0100 Subject: [PATCH 009/704] [AMDGPU][AsmParser][NFC] Refine parsing of NamedOperandU32 operands. Eliminates the need for the custom code in parseCustomOperand(). Part of . 
Reviewed By: dp Differential Revision: https://reviews.llvm.org/D150980 --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 20 +---- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 73 +++++++------------ llvm/lib/Target/AMDGPU/SMInstructions.td | 3 +- 3 files changed, 28 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index e768588f39dc3..7b4817f373b56 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1761,7 +1761,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { AMDGPUOperand::Ptr defaultSMEMOffsetMod() const; AMDGPUOperand::Ptr defaultFlatOffset() const; - OperandMatchResultTy parseOModOperand(OperandVector &Operands); + OperandMatchResultTy parseOModSI(OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); @@ -8048,7 +8048,7 @@ void AMDGPUAsmParser::onBeginOfFile() { getTargetStreamer().EmitDirectiveAMDGCNTarget(); } -OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) { +OperandMatchResultTy AMDGPUAsmParser::parseOModSI(OperandVector &Operands) { StringRef Name = getTokenStr(); if (Name == "mul") { return parseIntWithPrefix("mul", Operands, @@ -9129,24 +9129,8 @@ AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, unsigned MCK) { return parseTokenOp("off", Operands); case MCK_row_95_en: return parseTokenOp("row_en", Operands); - case MCK_ImmCPol: - return parseCPol(Operands); case MCK_gds: return parseNamedBit("gds", Operands, AMDGPUOperand::ImmTyGDS); - case MCK_ImmNegHi: - return parseOperandArrayWithPrefix("neg_hi", Operands, - AMDGPUOperand::ImmTyNegHi); - case MCK_ImmNegLo: - return parseOperandArrayWithPrefix("neg_lo", Operands, - AMDGPUOperand::ImmTyNegLo); - case MCK_ImmOModSI: - return parseOModOperand(Operands); - case MCK_ImmOpSel: - return parseOperandArrayWithPrefix("op_sel", Operands, - AMDGPUOperand::ImmTyOpSel); - case MCK_ImmOpSelHi: - return parseOperandArrayWithPrefix("op_sel_hi", Operands, - AMDGPUOperand::ImmTyOpSelHi); case MCK_tfe: return parseNamedBit("tfe", Operands, AMDGPUOperand::ImmTyTFE); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 47675a78114e3..dfb0b74ef320b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1121,15 +1121,6 @@ def SDWAVopcDst : BoolRC { let PrintMethod = "printVOPDst"; } -class NamedMatchClass : AsmOperandClass { - let Name = "Imm"#CName; - let PredicateMethod = "is"#CName; - let ParserMethod = !if(Optional, "", "parse"#CName); - let RenderMethod = "addImmOperands"; - let IsOptional = Optional; - let DefaultMethod = !if(Optional, "default"#CName, ?); -} - class CustomOperandClass : AsmOperandClass { let Name = CName; let PredicateMethod = "is"#CName; @@ -1143,6 +1134,7 @@ class CustomOperandProps> { string PrintMethod = "print"#Name; AsmOperandClass ParserMatchClass = Class; + string OperandType = "OPERAND_IMMEDIATE"; } class CustomOperand class NamedBitOperand : CustomOperand>; -class DefaultOperand_0 - : OperandWithDefaultOps, +class DefaultOperand + : OperandWithDefaultOps, CustomOperandProps<1, Op.ParserMatchClass.Name, Op.ParserMatchClass>; -class NamedOperandU32 : Operand { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; -} - -class NamedOperandU32_0 : - OperandWithDefaultOps { - let PrintMethod = "print"#Name; - let ParserMatchClass = 
MatchClass; -} - -class NamedOperandU32Default0 : - OperandWithDefaultOps { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; -} - -class NamedOperandU32Default1 : - OperandWithDefaultOps { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; -} - class SDWAOperandClass : CustomOperandClass { string ImmTy = "AMDGPUOperand::ImmTy"#Name; @@ -1211,7 +1180,17 @@ class SDWAOperandClass class SDWAOperand : CustomOperand>; -let OperandType = "OPERAND_IMMEDIATE" in { +class ArrayOperandClass + : CustomOperandClass { + string ImmTy = "AMDGPUOperand::ImmTy"#Name; + let ParserMethod = + "[this](OperandVector &Operands) -> OperandMatchResultTy { "# + "return parseOperandArrayWithPrefix(\""#Id#"\", Operands, "#ImmTy#"); }"; +} + +class ArrayOperand0 + : OperandWithDefaultOps, + CustomOperandProps<1, Name, ArrayOperandClass>; def flat_offset : CustomOperand; def offset : NamedIntOperand; @@ -1220,23 +1199,23 @@ def offset1 : NamedIntOperand; def gds : NamedBitOperand<"gds", "GDS">; -def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>; -def omod0 : NamedOperandU32_0<"OModSI", NamedMatchClass<"OModSI">>; +def omod : CustomOperand; +def omod0 : DefaultOperand; // We need to make the cases with a default of 0 distinct from no // default to help deal with some cases where the operand appears // before a mandatory operand. def clampmod : NamedBitOperand<"clamp", "ClampSI">; -def clampmod0 : DefaultOperand_0; +def clampmod0 : DefaultOperand; def highmod : NamedBitOperand<"high", "High">; -def CPol : NamedOperandU32<"CPol", NamedMatchClass<"CPol">>; -def CPol_0 : NamedOperandU32Default0<"CPol", NamedMatchClass<"CPol">>; -def CPol_GLC1 : NamedOperandU32Default1<"CPol", NamedMatchClass<"CPol">>; +def CPol : CustomOperand; +def CPol_0 : DefaultOperand; +def CPol_GLC1 : DefaultOperand; def TFE : NamedBitOperand<"tfe">; def SWZ : NamedBitOperand<"swz">; -def SWZ_0 : DefaultOperand_0; +def SWZ_0 : DefaultOperand; def UNorm : NamedBitOperand<"unorm">; def DA : NamedBitOperand<"da">; def R128A16 : CustomOperand; @@ -1256,10 +1235,10 @@ def src0_sel : SDWAOperand<"src0_sel", "SDWASrc0Sel">; def src1_sel : SDWAOperand<"src1_sel", "SDWASrc1Sel">; def dst_unused : CustomOperand; -def op_sel0 : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>; -def op_sel_hi0 : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; -def neg_lo0 : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; -def neg_hi0 : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; +def op_sel0 : ArrayOperand0<"op_sel", "OpSel">; +def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">; +def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; +def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; def dpp8 : CustomOperand; def dpp_ctrl : CustomOperand; @@ -1281,8 +1260,6 @@ def exp_tgt : CustomOperand; def wait_vdst : NamedIntOperand; def wait_exp : NamedIntOperand; -} // End OperandType = "OPERAND_IMMEDIATE" - class KImmMatchClass : AsmOperandClass { let Name = "KImmFP"#size; let PredicateMethod = "isKImmFP"#size; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 62c4deae52104..2d0d2f3bb0dfb 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -8,8 +8,7 @@ def smrd_offset_8 : ImmOperand; -let OperandType = "OPERAND_IMMEDIATE", - EncoderMethod = "getSMEMOffsetEncoding", +let EncoderMethod = "getSMEMOffsetEncoding", DecoderMethod = "decodeSMEMOffset" in { def smem_offset : ImmOperand; def 
smem_offset_mod : NamedIntOperand; From 196d89740c5e8bf238200b7f95e6173b231aa5d2 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Mon, 29 May 2023 17:58:50 +0200 Subject: [PATCH 010/704] [mlir][llvm] Add rounding intrinsics Add some of the missing libm rounding intrinsics to the LLVM dialect: * `llvm.rint` * `llvm.nearbyint` * `llvm.lround` * `llvm.llround` * `llvm.lrint` * `llvm.llrint` Differential Revision: https://reviews.llvm.org/D151558 --- .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 12 +++ mlir/test/Target/LLVMIR/Import/intrinsic.ll | 86 ++++++++++++++++++ .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 90 +++++++++++++++++++ 3 files changed, 188 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index eb815b3f0b0d4..a409223ade155 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -130,6 +130,18 @@ def LLVM_PowIOp : LLVM_OneResultIntrOp<"powi", [], [0,1], let assemblyFormat = "`(` operands `)` custom(attr-dict) `:` " "functional-type(operands, results)"; } +def LLVM_RintOp : LLVM_UnaryIntrOpF<"rint">; +def LLVM_NearbyintOp : LLVM_UnaryIntrOpF<"nearbyint">; +class LLVM_IntRoundIntrOpBase : + LLVM_OneResultIntrOp { + let arguments = (ins LLVM_AnyFloat:$val); + let assemblyFormat = "`(` operands `)` custom(attr-dict) `:` " + "functional-type(operands, results)"; +} +def LLVM_LroundOp : LLVM_IntRoundIntrOpBase<"lround">; +def LLVM_LlroundOp : LLVM_IntRoundIntrOpBase<"llround">; +def LLVM_LrintOp : LLVM_IntRoundIntrOpBase<"lrint">; +def LLVM_LlrintOp : LLVM_IntRoundIntrOpBase<"llrint">; def LLVM_BitReverseOp : LLVM_UnaryIntrOpI<"bitreverse">; def LLVM_ByteSwapOp : LLVM_UnaryIntrOpI<"bswap">; def LLVM_CountLeadingZerosOp : LLVM_CountZerosIntrOp<"ctlz">; diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 811dc44973410..e9b361509d037 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -117,6 +117,72 @@ define void @pow_test(float %0, float %1, <8 x float> %2, <8 x float> %3) { %6 = call <8 x float> @llvm.pow.v8f32(<8 x float> %2, <8 x float> %3) ret void } + +; CHECK-LABEL: llvm.func @rint_test +define void @rint_test(float %0, double %1, <8 x float> %2, <8 x double> %3) { + ; CHECK: llvm.intr.rint(%{{.*}}) : (f32) -> f32 + %5 = call float @llvm.rint.f32(float %0) + ; CHECK: llvm.intr.rint(%{{.*}}) : (f64) -> f64 + %6 = call double @llvm.rint.f64(double %1) + ; CHECK: llvm.intr.rint(%{{.*}}) : (vector<8xf32>) -> vector<8xf32> + %7 = call <8 x float> @llvm.rint.v8f32(<8 x float> %2) + ; CHECK: llvm.intr.rint(%{{.*}}) : (vector<8xf64>) -> vector<8xf64> + %8 = call <8 x double> @llvm.rint.v8f64(<8 x double> %3) + ret void +} +; CHECK-LABEL: llvm.func @nearbyint_test +define void @nearbyint_test(float %0, double %1, <8 x float> %2, <8 x double> %3) { + ; CHECK: llvm.intr.nearbyint(%{{.*}}) : (f32) -> f32 + %5 = call float @llvm.nearbyint.f32(float %0) + ; CHECK: llvm.intr.nearbyint(%{{.*}}) : (f64) -> f64 + %6 = call double @llvm.nearbyint.f64(double %1) + ; CHECK: llvm.intr.nearbyint(%{{.*}}) : (vector<8xf32>) -> vector<8xf32> + %7 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %2) + ; CHECK: llvm.intr.nearbyint(%{{.*}}) : (vector<8xf64>) -> vector<8xf64> + %8 = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %3) + ret void +} +; CHECK-LABEL: llvm.func @lround_test +define void @lround_test(float %0, double 
%1) { + ; CHECK: llvm.intr.lround(%{{.*}}) : (f32) -> i32 + %3 = call i32 @llvm.lround.i32.f32(float %0) + ; CHECK: llvm.intr.lround(%{{.*}}) : (f32) -> i64 + %4 = call i64 @llvm.lround.i64.f32(float %0) + ; CHECK: llvm.intr.lround(%{{.*}}) : (f64) -> i32 + %5 = call i32 @llvm.lround.i32.f64(double %1) + ; CHECK: llvm.intr.lround(%{{.*}}) : (f64) -> i64 + %6 = call i64 @llvm.lround.i64.f64(double %1) + ret void +} +; CHECK-LABEL: llvm.func @llround_test +define void @llround_test(float %0, double %1) { + ; CHECK: llvm.intr.llround(%{{.*}}) : (f32) -> i64 + %3 = call i64 @llvm.llround.i64.f32(float %0) + ; CHECK: llvm.intr.llround(%{{.*}}) : (f64) -> i64 + %4 = call i64 @llvm.llround.i64.f64(double %1) + ret void +} +; CHECK-LABEL: llvm.func @lrint_test +define void @lrint_test(float %0, double %1) { + ; CHECK: llvm.intr.lrint(%{{.*}}) : (f32) -> i32 + %3 = call i32 @llvm.lrint.i32.f32(float %0) + ; CHECK: llvm.intr.lrint(%{{.*}}) : (f32) -> i64 + %4 = call i64 @llvm.lrint.i64.f32(float %0) + ; CHECK: llvm.intr.lrint(%{{.*}}) : (f64) -> i32 + %5 = call i32 @llvm.lrint.i32.f64(double %1) + ; CHECK: llvm.intr.lrint(%{{.*}}) : (f64) -> i64 + %6 = call i64 @llvm.lrint.i64.f64(double %1) + ret void +} +; CHECK-LABEL: llvm.func @llrint_test +define void @llrint_test(float %0, double %1) { + ; CHECK: llvm.intr.llrint(%{{.*}}) : (f32) -> i64 + %3 = call i64 @llvm.llrint.i64.f32(float %0) + ; CHECK: llvm.intr.llrint(%{{.*}}) : (f64) -> i64 + %4 = call i64 @llvm.llrint.i64.f64(double %1) + ret void +} + ; CHECK-LABEL: llvm.func @bitreverse_test define void @bitreverse_test(i32 %0, <8 x i32> %1) { ; CHECK: llvm.intr.bitreverse(%{{.*}}) : (i32) -> i32 @@ -781,6 +847,26 @@ declare float @llvm.copysign.f32(float, float) declare <8 x float> @llvm.copysign.v8f32(<8 x float>, <8 x float>) declare float @llvm.pow.f32(float, float) declare <8 x float> @llvm.pow.v8f32(<8 x float>, <8 x float>) +declare float @llvm.rint.f32(float) +declare double @llvm.rint.f64(double) +declare <8 x float> @llvm.rint.v8f32(<8 x float>) +declare <8 x double> @llvm.rint.v8f64(<8 x double>) +declare float @llvm.nearbyint.f32(float) +declare double @llvm.nearbyint.f64(double) +declare <8 x float> @llvm.nearbyint.v8f32(<8 x float>) +declare <8 x double> @llvm.nearbyint.v8f64(<8 x double>) +declare i32 @llvm.lround.i32.f32(float) +declare i64 @llvm.lround.i64.f32(float) +declare i32 @llvm.lround.i32.f64(double) +declare i64 @llvm.lround.i64.f64(double) +declare i64 @llvm.llround.i64.f32(float) +declare i64 @llvm.llround.i64.f64(double) +declare i32 @llvm.lrint.i32.f32(float) +declare i64 @llvm.lrint.i64.f32(float) +declare i32 @llvm.lrint.i32.f64(double) +declare i64 @llvm.lrint.i64.f64(double) +declare i64 @llvm.llrint.i64.f32(float) +declare i64 @llvm.llrint.i64.f64(double) declare i32 @llvm.bitreverse.i32(i32) declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) declare i32 @llvm.bswap.i32(i32) diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index c6a3c7fbb4450..ec619b9a9d367 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -134,6 +134,76 @@ llvm.func @pow_test(%arg0: f32, %arg1: f32, %arg2: vector<8xf32>, %arg3: vector< llvm.return } +// CHECK-LABEL: @rint_test +llvm.func @rint_test(%arg0 : f32, %arg1 : f64, %arg2 : vector<8xf32>, %arg3 : vector<8xf64>) { + // CHECK: call float @llvm.rint.f32 + "llvm.intr.rint"(%arg0) : (f32) -> f32 + // CHECK: call double @llvm.rint.f64 + "llvm.intr.rint"(%arg1) : (f64) 
-> f64 + // CHECK: call <8 x float> @llvm.rint.v8f32 + "llvm.intr.rint"(%arg2) : (vector<8xf32>) -> vector<8xf32> + // CHECK: call <8 x double> @llvm.rint.v8f64 + "llvm.intr.rint"(%arg3) : (vector<8xf64>) -> vector<8xf64> + llvm.return +} + +// CHECK-LABEL: @nearbyint_test +llvm.func @nearbyint_test(%arg0 : f32, %arg1 : f64, %arg2 : vector<8xf32>, %arg3 : vector<8xf64>) { + // CHECK: call float @llvm.nearbyint.f32 + "llvm.intr.nearbyint"(%arg0) : (f32) -> f32 + // CHECK: call double @llvm.nearbyint.f64 + "llvm.intr.nearbyint"(%arg1) : (f64) -> f64 + // CHECK: call <8 x float> @llvm.nearbyint.v8f32 + "llvm.intr.nearbyint"(%arg2) : (vector<8xf32>) -> vector<8xf32> + // CHECK: call <8 x double> @llvm.nearbyint.v8f64 + "llvm.intr.nearbyint"(%arg3) : (vector<8xf64>) -> vector<8xf64> + llvm.return +} + +// CHECK-LABEL: @lround_test +llvm.func @lround_test(%arg0 : f32, %arg1 : f64) { + // CHECK: call i32 @llvm.lround.i32.f32 + "llvm.intr.lround"(%arg0) : (f32) -> i32 + // CHECK: call i64 @llvm.lround.i64.f32 + "llvm.intr.lround"(%arg0) : (f32) -> i64 + // CHECK: call i32 @llvm.lround.i32.f64 + "llvm.intr.lround"(%arg1) : (f64) -> i32 + // CHECK: call i64 @llvm.lround.i64.f64 + "llvm.intr.lround"(%arg1) : (f64) -> i64 + llvm.return +} + +// CHECK-LABEL: @llround_test +llvm.func @llround_test(%arg0 : f32, %arg1 : f64) { + // CHECK: call i64 @llvm.llround.i64.f32 + "llvm.intr.llround"(%arg0) : (f32) -> i64 + // CHECK: call i64 @llvm.llround.i64.f64 + "llvm.intr.llround"(%arg1) : (f64) -> i64 + llvm.return +} + +// CHECK-LABEL: @lrint_test +llvm.func @lrint_test(%arg0 : f32, %arg1 : f64) { + // CHECK: call i32 @llvm.lrint.i32.f32 + "llvm.intr.lrint"(%arg0) : (f32) -> i32 + // CHECK: call i64 @llvm.lrint.i64.f32 + "llvm.intr.lrint"(%arg0) : (f32) -> i64 + // CHECK: call i32 @llvm.lrint.i32.f64 + "llvm.intr.lrint"(%arg1) : (f64) -> i32 + // CHECK: call i64 @llvm.lrint.i64.f64 + "llvm.intr.lrint"(%arg1) : (f64) -> i64 + llvm.return +} + +// CHECK-LABEL: @llrint_test +llvm.func @llrint_test(%arg0 : f32, %arg1 : f64) { + // CHECK: call i64 @llvm.llrint.i64.f32 + "llvm.intr.llrint"(%arg0) : (f32) -> i64 + // CHECK: call i64 @llvm.llrint.i64.f64 + "llvm.intr.llrint"(%arg1) : (f64) -> i64 + llvm.return +} + // CHECK-LABEL: @bitreverse_test llvm.func @bitreverse_test(%arg0: i32, %arg1: vector<8xi32>) { // CHECK: call i32 @llvm.bitreverse.i32 @@ -865,6 +935,26 @@ llvm.func @lifetime(%p: !llvm.ptr) { // CHECK-DAG: declare float @llvm.cos.f32(float) // CHECK-DAG: declare <8 x float> @llvm.cos.v8f32(<8 x float>) #0 // CHECK-DAG: declare float @llvm.copysign.f32(float, float) +// CHECK-DAG: declare float @llvm.rint.f32(float) +// CHECK-DAG: declare double @llvm.rint.f64(double) +// CHECK-DAG: declare <8 x float> @llvm.rint.v8f32(<8 x float>) +// CHECK-DAG: declare <8 x double> @llvm.rint.v8f64(<8 x double>) +// CHECK-DAG: declare float @llvm.nearbyint.f32(float) +// CHECK-DAG: declare double @llvm.nearbyint.f64(double) +// CHECK-DAG: declare <8 x float> @llvm.nearbyint.v8f32(<8 x float>) +// CHECK-DAG: declare <8 x double> @llvm.nearbyint.v8f64(<8 x double>) +// CHECK-DAG: declare i32 @llvm.lround.i32.f32(float) +// CHECK-DAG: declare i64 @llvm.lround.i64.f32(float) +// CHECK-DAG: declare i32 @llvm.lround.i32.f64(double) +// CHECK-DAG: declare i64 @llvm.lround.i64.f64(double) +// CHECK-DAG: declare i64 @llvm.llround.i64.f32(float) +// CHECK-DAG: declare i64 @llvm.llround.i64.f64(double) +// CHECK-DAG: declare i32 @llvm.lrint.i32.f32(float) +// CHECK-DAG: declare i64 @llvm.lrint.i64.f32(float) +// CHECK-DAG: 
declare i32 @llvm.lrint.i32.f64(double) +// CHECK-DAG: declare i64 @llvm.lrint.i64.f64(double) +// CHECK-DAG: declare i64 @llvm.llrint.i64.f32(float) +// CHECK-DAG: declare i64 @llvm.llrint.i64.f64(double) // CHECK-DAG: declare <12 x float> @llvm.matrix.multiply.v12f32.v64f32.v48f32(<64 x float>, <48 x float>, i32 immarg, i32 immarg, i32 immarg) // CHECK-DAG: declare <48 x float> @llvm.matrix.transpose.v48f32(<48 x float>, i32 immarg, i32 immarg) // CHECK-DAG: declare <48 x float> @llvm.matrix.column.major.load.v48f32.i64(ptr nocapture, i64, i1 immarg, i32 immarg, i32 immarg) From 728a7de88a780709c81476dd8e9287c09a0d1bcf Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Mon, 29 May 2023 16:31:04 +0000 Subject: [PATCH 011/704] Revert "[llvm-debuginfod][NFC] Switch to OptTable" This reverts commit 1610627d2b42c0daf8cc20649c50ddad222f001b. --- llvm/tools/llvm-debuginfod/CMakeLists.txt | 8 - llvm/tools/llvm-debuginfod/Opts.td | 20 --- .../tools/llvm-debuginfod/llvm-debuginfod.cpp | 140 +++++------------- .../llvm/tools/llvm-debuginfod/BUILD.gn | 8 - 4 files changed, 40 insertions(+), 136 deletions(-) delete mode 100644 llvm/tools/llvm-debuginfod/Opts.td diff --git a/llvm/tools/llvm-debuginfod/CMakeLists.txt b/llvm/tools/llvm-debuginfod/CMakeLists.txt index d32c6826d7687..72f2c19848489 100644 --- a/llvm/tools/llvm-debuginfod/CMakeLists.txt +++ b/llvm/tools/llvm-debuginfod/CMakeLists.txt @@ -1,16 +1,8 @@ set(LLVM_LINK_COMPONENTS - Option Support ) -set(LLVM_TARGET_DEFINITIONS Opts.td) -tablegen(LLVM Opts.inc -gen-opt-parser-defs) -add_public_tablegen_target(DebugInfodOptsTableGen) - add_llvm_tool(llvm-debuginfod llvm-debuginfod.cpp - - DEPENDS - DebugInfodOptsTableGen ) target_link_libraries(llvm-debuginfod PRIVATE LLVMDebuginfod) if(LLVM_INSTALL_BINUTILS_SYMLINKS) diff --git a/llvm/tools/llvm-debuginfod/Opts.td b/llvm/tools/llvm-debuginfod/Opts.td deleted file mode 100644 index 1de241a3fc2a1..0000000000000 --- a/llvm/tools/llvm-debuginfod/Opts.td +++ /dev/null @@ -1,20 +0,0 @@ -include "llvm/Option/OptParser.td" - -class F : Flag<["-"], name>, HelpText; -class FF: Flag<["--"], name>, HelpText; -class S: Separate<["-"], name>, HelpText, MetaVarName; - -def help : FF<"help", "Display available options">; -def : F<"h", "Alias for --help">, Alias; -def max_concurrency : - S<"c", "", "Maximum number of files to scan concurrently. " - "If 0, use the hardware concurrency.">; -def host_interface : S<"i", "", "Host interface to bind to.">; -def min_interval : - S<"m", "", "Minimum number of seconds to wait before an on-demand update can be" - "triggered by a request for a buildid which is not in the collection.">; -def port : S<"p", "", "Port to listen on. 
Set to 0 to bind to any available port.">; -def scan_interval : - S<"t", "", "Number of seconds to wait between subsequent " - "automated scans of the filesystem.">; -def verbose_logging : F<"v", "Enable verbose logging.">; diff --git a/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp b/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp index 7edc78e7f3f2a..c64d4dbb3155f 100644 --- a/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp +++ b/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp @@ -15,120 +15,60 @@ /// //===----------------------------------------------------------------------===// -#include "llvm/ADT/StringRef.h" #include "llvm/Debuginfod/Debuginfod.h" #include "llvm/Debuginfod/HTTPClient.h" -#include "llvm/Option/ArgList.h" -#include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/ThreadPool.h" using namespace llvm; -// Command-line option boilerplate. -namespace { -enum ID { - OPT_INVALID = 0, // This is not an option ID. -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - OPT_##ID, -#include "Opts.inc" -#undef OPTION -}; - -#define PREFIX(NAME, VALUE) \ - static constexpr StringLiteral NAME##_init[] = VALUE; \ - static constexpr ArrayRef NAME(NAME##_init, \ - std::size(NAME##_init) - 1); -#include "Opts.inc" -#undef PREFIX - -static constexpr opt::OptTable::Info InfoTable[] = { -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - { \ - PREFIX, NAME, HELPTEXT, \ - METAVAR, OPT_##ID, opt::Option::KIND##Class, \ - PARAM, FLAGS, OPT_##GROUP, \ - OPT_##ALIAS, ALIASARGS, VALUES}, -#include "Opts.inc" -#undef OPTION -}; - -class DebuginfodOptTable : public opt::GenericOptTable { -public: - DebuginfodOptTable() : GenericOptTable(InfoTable) {} -}; -} // end anonymous namespace - -// Options -static unsigned Port; -static std::string HostInterface; -static int ScanInterval; -static double MinInterval; -static size_t MaxConcurrency; -static bool VerboseLogging; -static std::vector ScanPaths; +cl::OptionCategory DebuginfodCategory("llvm-debuginfod Options"); + +static cl::list ScanPaths(cl::Positional, + cl::desc(""), + cl::cat(DebuginfodCategory)); + +static cl::opt + Port("p", cl::init(0), + cl::desc("Port to listen on. Set to 0 to bind to any available port."), + cl::cat(DebuginfodCategory)); + +static cl::opt + HostInterface("i", cl::init("0.0.0.0"), + cl::desc("Host interface to bind to."), + cl::cat(DebuginfodCategory)); + +static cl::opt + ScanInterval("t", cl::init(300), + cl::desc("Number of seconds to wait between subsequent " + "automated scans of the filesystem."), + cl::cat(DebuginfodCategory)); + +static cl::opt MinInterval( + "m", cl::init(10), + cl::desc( + "Minimum number of seconds to wait before an on-demand update can be " + "triggered by a request for a buildid which is not in the collection."), + cl::cat(DebuginfodCategory)); + +static cl::opt + MaxConcurrency("c", cl::init(0), + cl::desc("Maximum number of files to scan concurrently. 
If " + "0, use the hardware concurrency."), + cl::cat(DebuginfodCategory)); + +static cl::opt VerboseLogging("v", cl::init(false), + cl::desc("Enable verbose logging."), + cl::cat(DebuginfodCategory)); ExitOnError ExitOnErr; -template -static void parseIntArg(const opt::InputArgList &Args, int ID, T &Value, - T Default) { - if (const opt::Arg *A = Args.getLastArg(ID)) { - StringRef V(A->getValue()); - if (!llvm::to_integer(V, Value, 0)) { - errs() << A->getSpelling() + ": expected an integer, but got '" + V + "'"; - exit(1); - } - } else { - Value = Default; - } -} - -static void parseArgs(int argc, char **argv) { - DebuginfodOptTable Tbl; - llvm::StringRef ToolName = argv[0]; - llvm::BumpPtrAllocator A; - llvm::StringSaver Saver{A}; - opt::InputArgList Args = - Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) { - llvm::errs() << Msg << '\n'; - std::exit(1); - }); - - if (Args.hasArg(OPT_help)) { - Tbl.printHelp(llvm::outs(), - "llvm-debuginfod [options] ", - ToolName.str().c_str()); - std::exit(0); - } - - VerboseLogging = Args.hasArg(OPT_verbose_logging); - ScanPaths = Args.getAllArgValues(OPT_INPUT); - - parseIntArg(Args, OPT_port, Port, 0u); - parseIntArg(Args, OPT_scan_interval, ScanInterval, 300); - parseIntArg(Args, OPT_max_concurrency, MaxConcurrency, 0ul); - - if (const opt::Arg *A = Args.getLastArg(OPT_min_interval)) { - StringRef V(A->getValue()); - if (!llvm::to_float(V, MinInterval)) { - errs() << A->getSpelling() + ": expected a number, but got '" + V + "'"; - exit(1); - } - } else { - MinInterval = 10.0; - } - - HostInterface = Args.getLastArgValue(OPT_host_interface, "0.0.0.0"); -} - int main(int argc, char **argv) { InitLLVM X(argc, argv); HTTPClient::initialize(); - parseArgs(argc, argv); + cl::HideUnrelatedOptions({&DebuginfodCategory}); + cl::ParseCommandLineOptions(argc, argv); SmallVector Paths; for (const std::string &Path : ScanPaths) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod/BUILD.gn index 236124f351bf7..c8ee330a867cb 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod/BUILD.gn @@ -1,12 +1,6 @@ import("//llvm/tools/binutils_symlinks.gni") -import("//llvm/utils/TableGen/tablegen.gni") import("//llvm/utils/gn/build/symlink_or_copy.gni") -tablegen("Opts") { - visibility = [ ":llvm-debuginfod" ] - args = [ "-gen-opt-parser-defs" ] -} - if (llvm_install_binutils_symlinks) { symlink_or_copy("debuginfod") { deps = [ ":llvm-debuginfod" ] @@ -25,9 +19,7 @@ group("symlinks") { executable("llvm-debuginfod") { deps = [ - ":Opts", "//llvm/lib/Debuginfod", - "//llvm/lib/Option", "//llvm/lib/Support", ] sources = [ "llvm-debuginfod.cpp" ] From cd21c0d30c0803b24e1dcf71d2f04cb46e539905 Mon Sep 17 00:00:00 2001 From: Saleem Abdulrasool Date: Mon, 29 May 2023 09:35:19 -0700 Subject: [PATCH 012/704] Revert "Revert "Host: generalise `GetXcodeSDKPath`"" This reverts commit c46d9af26cefb0b24646d3235b75ae7a1b8548d4. Rename the variable to avoid `-Wchanges-meaning` warning. Although, it might be better to squelch the warning as it is of low value IMO. 
--- lldb/include/lldb/Host/HostInfoBase.h | 29 ++++++++++++++++--- .../include/lldb/Host/macosx/HostInfoMacOSX.h | 2 +- lldb/source/Core/Module.cpp | 4 +-- .../Host/macosx/objcxx/HostInfoMacOSX.mm | 10 +++++-- .../MacOSX/PlatformAppleSimulator.cpp | 3 +- .../Platform/MacOSX/PlatformMacOSX.cpp | 3 +- lldb/unittests/Host/HostInfoTest.cpp | 3 +- 7 files changed, 42 insertions(+), 12 deletions(-) diff --git a/lldb/include/lldb/Host/HostInfoBase.h b/lldb/include/lldb/Host/HostInfoBase.h index 6c86c71e552dc..4082cd7f62bc6 100644 --- a/lldb/include/lldb/Host/HostInfoBase.h +++ b/lldb/include/lldb/Host/HostInfoBase.h @@ -31,6 +31,23 @@ struct SharedCacheImageInfo { lldb::DataBufferSP data_sp; }; +namespace { +struct HostInfoError : public llvm::ErrorInfo { + static char ID; + const std::string message_; + + HostInfoError(const std::string message) : message_(std::move(message)) {} + + void log(llvm::raw_ostream &OS) const override { OS << "HostInfoError"; } + + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; + +char HostInfoError::ID = 0; +} // namespace + class HostInfoBase { private: // Static class, unconstructable. @@ -108,10 +125,14 @@ class HostInfoBase { static FileSpec GetXcodeContentsDirectory() { return {}; } static FileSpec GetXcodeDeveloperDirectory() { return {}; } - - /// Return the directory containing a specific Xcode SDK. - static llvm::Expected GetXcodeSDKPath(XcodeSDK sdk) { - return ""; + + struct SDKOptions { + std::optional XcodeSDKSelection; + }; + + /// Return the directory containing something like a SDK (reused for Swift). + static llvm::Expected GetSDKRoot(SDKOptions options) { + return llvm::make_error("cannot determine SDK root"); } /// Return information about module \p image_name if it is loaded in diff --git a/lldb/include/lldb/Host/macosx/HostInfoMacOSX.h b/lldb/include/lldb/Host/macosx/HostInfoMacOSX.h index 0402509cfa261..74d979d965a73 100644 --- a/lldb/include/lldb/Host/macosx/HostInfoMacOSX.h +++ b/lldb/include/lldb/Host/macosx/HostInfoMacOSX.h @@ -31,7 +31,7 @@ class HostInfoMacOSX : public HostInfoPosix { static FileSpec GetXcodeDeveloperDirectory(); /// Query xcrun to find an Xcode SDK directory. 
- static llvm::Expected GetXcodeSDKPath(XcodeSDK sdk); + static llvm::Expected GetSDKRoot(SDKOptions options); /// Shared cache utilities static SharedCacheImageInfo diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp index d527bdc76309d..a4ba09ef56418 100644 --- a/lldb/source/Core/Module.cpp +++ b/lldb/source/Core/Module.cpp @@ -1607,8 +1607,8 @@ std::optional Module::RemapSourceFile(llvm::StringRef path) const { void Module::RegisterXcodeSDK(llvm::StringRef sdk_name, llvm::StringRef sysroot) { - XcodeSDK sdk(sdk_name.str()); - auto sdk_path_or_err = HostInfo::GetXcodeSDKPath(sdk); + auto sdk_path_or_err = + HostInfo::GetSDKRoot(HostInfo::SDKOptions{sdk_name.str()}); if (!sdk_path_or_err) { Debugger::ReportError("Error while searching for Xcode SDK: " + diff --git a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm index 5a39ed370747a..0b4fc1885cae9 100644 --- a/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm +++ b/lldb/source/Host/macosx/objcxx/HostInfoMacOSX.mm @@ -338,7 +338,8 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) { } } - auto sdk_path_or_err = HostInfo::GetXcodeSDKPath(XcodeSDK::GetAnyMacOS()); + auto sdk_path_or_err = + HostInfo::GetSDKRoot(SDKOptions{XcodeSDK::GetAnyMacOS()}); if (!sdk_path_or_err) { Log *log = GetLog(LLDBLog::Host); LLDB_LOGF(log, "Error while searching for Xcode SDK: %s", @@ -519,7 +520,7 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) { return path; } -llvm::Expected HostInfoMacOSX::GetXcodeSDKPath(XcodeSDK sdk) { +llvm::Expected HostInfoMacOSX::GetSDKRoot(SDKOptions options) { struct ErrorOrPath { std::string str; bool is_error; @@ -530,6 +531,11 @@ static void ParseOSVersion(llvm::VersionTuple &version, NSString *Key) { std::lock_guard guard(g_sdk_path_mutex); LLDB_SCOPED_TIMER(); + if (!options.XcodeSDKSelection) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "XCodeSDK not specified"); + XcodeSDK sdk = *options.XcodeSDKSelection; + auto key = sdk.GetString(); auto it = g_sdk_path.find(key); if (it != g_sdk_path.end()) { diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp index 7501f3edd24ff..7044426e17b50 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformAppleSimulator.cpp @@ -284,7 +284,8 @@ static llvm::StringRef GetXcodeSDKDir(std::string preferred, std::string secondary) { llvm::StringRef sdk; auto get_sdk = [&](std::string sdk) -> llvm::StringRef { - auto sdk_path_or_err = HostInfo::GetXcodeSDKPath(XcodeSDK(std::move(sdk))); + auto sdk_path_or_err = + HostInfo::GetSDKRoot(HostInfo::SDKOptions{XcodeSDK(std::move(sdk))}); if (!sdk_path_or_err) { Debugger::ReportError("Error while searching for Xcode SDK: " + toString(sdk_path_or_err.takeError())); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp index 25b821d12a314..ba412da62e57b 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp @@ -124,7 +124,8 @@ ConstString PlatformMacOSX::GetSDKDirectory(lldb_private::Target &target) { } // Use the default SDK as a fallback. 
- auto sdk_path_or_err = HostInfo::GetXcodeSDKPath(XcodeSDK::GetAnyMacOS()); + auto sdk_path_or_err = + HostInfo::GetSDKRoot(HostInfo::SDKOptions{XcodeSDK::GetAnyMacOS()}); if (!sdk_path_or_err) { Debugger::ReportError("Error while searching for Xcode SDK: " + toString(sdk_path_or_err.takeError())); diff --git a/lldb/unittests/Host/HostInfoTest.cpp b/lldb/unittests/Host/HostInfoTest.cpp index b6c8edd74154f..322675c7f485a 100644 --- a/lldb/unittests/Host/HostInfoTest.cpp +++ b/lldb/unittests/Host/HostInfoTest.cpp @@ -57,7 +57,8 @@ TEST_F(HostInfoTest, GetHostname) { #if defined(__APPLE__) TEST_F(HostInfoTest, GetXcodeSDK) { auto get_sdk = [](std::string sdk, bool error = false) -> llvm::StringRef { - auto sdk_path_or_err = HostInfo::GetXcodeSDKPath(XcodeSDK(std::move(sdk))); + auto sdk_path_or_err = + HostInfo::GetSDKRoot(HostInfo::SDKOptions{XcodeSDK(std::move(sdk))}); if (!error) { EXPECT_TRUE((bool)sdk_path_or_err); return *sdk_path_or_err; From 482ee33a637d1abaf3c9d5908f2653997e5842a2 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Tue, 21 Mar 2023 20:01:22 +0100 Subject: [PATCH 013/704] [lld] Use correct machine type in ARM64EC COFF headers. This adds very minimal support for ARM64EC/ARM64X targets, just enough for interesting test cases. Next patches in the series extend llvm-objdump and llvm-readobj to provide better tests. Those will also be useful for testing further ARM64EC LLD support. Differential Revision: https://reviews.llvm.org/D149086 --- lld/COFF/Chunks.cpp | 4 ++++ lld/COFF/Config.h | 6 +++++- lld/COFF/Writer.cpp | 11 ++++++++++- lld/test/COFF/arm64ec.test | 23 +++++++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 lld/test/COFF/arm64ec.test diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 7ec4829599906..8ffe79f139ff0 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -447,6 +447,8 @@ void SectionChunk::applyRelocation(uint8_t *off, applyRelARM(off, rel.Type, os, s, p, imageBase); break; case ARM64: + case ARM64EC: + case ARM64X: applyRelARM64(off, rel.Type, os, s, p, imageBase); break; default: @@ -532,6 +534,8 @@ static uint8_t getBaserelType(const coff_relocation &rel, return IMAGE_REL_BASED_ARM_MOV32T; return IMAGE_REL_BASED_ABSOLUTE; case ARM64: + case ARM64EC: + case ARM64X: if (rel.Type == IMAGE_REL_ARM64_ADDR64) return IMAGE_REL_BASED_DIR64; return IMAGE_REL_BASED_ABSOLUTE; diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 028cb9c13bafa..029c233e4544a 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -37,6 +37,8 @@ class SectionChunk; // Short aliases. static const auto AMD64 = llvm::COFF::IMAGE_FILE_MACHINE_AMD64; static const auto ARM64 = llvm::COFF::IMAGE_FILE_MACHINE_ARM64; +static const auto ARM64EC = llvm::COFF::IMAGE_FILE_MACHINE_ARM64EC; +static const auto ARM64X = llvm::COFF::IMAGE_FILE_MACHINE_ARM64X; static const auto ARMNT = llvm::COFF::IMAGE_FILE_MACHINE_ARMNT; static const auto I386 = llvm::COFF::IMAGE_FILE_MACHINE_I386; @@ -96,7 +98,9 @@ enum class ICFLevel { // Global configuration. 
struct Configuration { enum ManifestKind { Default, SideBySide, Embed, No }; - bool is64() const { return machine == AMD64 || machine == ARM64; } + bool is64() const { + return machine == AMD64 || llvm::COFF::isAnyArm64(machine); + } llvm::COFF::MachineTypes machine = IMAGE_FILE_MACHINE_UNKNOWN; size_t wordsize; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index fe9fb18c7855b..6498cfef27592 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1436,7 +1436,16 @@ template void Writer::writeHeader() { // Write COFF header auto *coff = reinterpret_cast(buf); buf += sizeof(*coff); - coff->Machine = config->machine; + switch (config->machine) { + case ARM64EC: + coff->Machine = AMD64; + break; + case ARM64X: + coff->Machine = ARM64; + break; + default: + coff->Machine = config->machine; + } coff->NumberOfSections = ctx.outputSections.size(); coff->Characteristics = IMAGE_FILE_EXECUTABLE_IMAGE; if (config->largeAddressAware) diff --git a/lld/test/COFF/arm64ec.test b/lld/test/COFF/arm64ec.test new file mode 100644 index 0000000000000..09e6975304e8d --- /dev/null +++ b/lld/test/COFF/arm64ec.test @@ -0,0 +1,23 @@ +REQUIRES: aarch64 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-data-sym.s -o arm64ec-data-sym.obj +RUN: llvm-cvtres -machine:arm64x -out:arm64x-resource.obj %S/Inputs/resource.res + +RUN: lld-link -out:test.dll -machine:arm64ec arm64ec-data-sym.obj -dll -noentry +RUN: llvm-readobj --file-headers test.dll | FileCheck -check-prefix=ARM64EC-HEADER %s +ARM64EC-HEADER: Format: COFF-x86-64 +ARM64EC-HEADER-NEXT: Arch: x86_64 +ARM64EC-HEADER-NEXT: AddressSize: 64bit +ARM64EC-HEADER: Machine: IMAGE_FILE_MACHINE_AMD64 (0x8664) + +RUN: lld-link -out:test.dll -machine:arm64x arm64x-resource.obj -dll -noentry +RUN: llvm-readobj --file-headers test.dll | FileCheck -check-prefix=ARM64X-HEADER %s +ARM64X-HEADER: Machine: IMAGE_FILE_MACHINE_ARM64 (0xAA64) + +#--- arm64ec-data-sym.s + .data + .globl arm64ec_data_sym + .p2align 2, 0x0 +arm64ec_data_sym: + .word 0x02020202 From cb227c944f2704ffe061a94bdb68e8a066cf141b Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 5 Apr 2023 23:03:15 +0200 Subject: [PATCH 014/704] [lld] Allow input files from compatible architectures on EC targets. 
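As a summary of the behaviour change: machine-type compatibility is now decided
by a new compatibleMachineType() helper in SymbolTable.cpp. A simplified sketch
of its logic (illustrative only; the parameter list is flattened here, the
version in the diff below is authoritative, and "output" stands for
ctx.config.machine):

  static bool compatibleMachineType(MachineTypes output, MachineTypes mt) {
    if (mt == IMAGE_FILE_MACHINE_UNKNOWN)
      return true;                  // unknown-machine inputs are always accepted
    switch (output) {
    case ARM64:                     // native arm64 also links arm64x objects
      return mt == ARM64 || mt == ARM64X;
    case ARM64EC:                   // arm64ec accepts arm64ec/arm64x and plain x64
      return COFF::isArm64EC(mt) || mt == AMD64;
    case ARM64X:                    // hybrid arm64x accepts any arm64 flavour plus x64
      return COFF::isAnyArm64(mt) || mt == AMD64;
    default:                        // everything else still requires an exact match
      return output == mt;
    }
  }

The new RUN lines in arm64ec.test exercise both the accepted combinations and
the rejected ones (INCOMPAT1-3).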
Differential Revision: https://reviews.llvm.org/D149087 --- lld/COFF/SymbolTable.cpp | 17 +++++++++++++- lld/test/COFF/arm64ec.test | 47 +++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 2ca7b82cac4e3..16d03754cc699 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -34,6 +34,21 @@ StringRef ltrim1(StringRef s, const char *chars) { return s; } +static bool compatibleMachineType(COFFLinkerContext &ctx, MachineTypes mt) { + if (mt == IMAGE_FILE_MACHINE_UNKNOWN) + return true; + switch (ctx.config.machine) { + case ARM64: + return mt == ARM64 || mt == ARM64X; + case ARM64EC: + return COFF::isArm64EC(mt) || mt == AMD64; + case ARM64X: + return COFF::isAnyArm64(mt) || mt == AMD64; + default: + return ctx.config.machine == mt; + } +} + void SymbolTable::addFile(InputFile *file) { log("Reading " + toString(file)); if (file->lazy) { @@ -56,7 +71,7 @@ void SymbolTable::addFile(InputFile *file) { if (ctx.config.machine == IMAGE_FILE_MACHINE_UNKNOWN) { ctx.config.machine = mt; ctx.driver.addWinSysRootLibSearchPaths(); - } else if (mt != IMAGE_FILE_MACHINE_UNKNOWN && ctx.config.machine != mt) { + } else if (!compatibleMachineType(ctx, mt)) { error(toString(file) + ": machine type " + machineToStr(mt) + " conflicts with " + machineToStr(ctx.config.machine)); return; diff --git a/lld/test/COFF/arm64ec.test b/lld/test/COFF/arm64ec.test index 09e6975304e8d..e50b14ce0184c 100644 --- a/lld/test/COFF/arm64ec.test +++ b/lld/test/COFF/arm64ec.test @@ -1,7 +1,9 @@ -REQUIRES: aarch64 +REQUIRES: aarch64, x86 RUN: split-file %s %t.dir && cd %t.dir +RUN: llvm-mc -filetype=obj -triple=aarch64-windows arm64-data-sym.s -o arm64-data-sym.obj RUN: llvm-mc -filetype=obj -triple=arm64ec-windows arm64ec-data-sym.s -o arm64ec-data-sym.obj +RUN: llvm-mc -filetype=obj -triple=x86_64-windows x86_64-data-sym.s -o x86_64-data-sym.obj RUN: llvm-cvtres -machine:arm64x -out:arm64x-resource.obj %S/Inputs/resource.res RUN: lld-link -out:test.dll -machine:arm64ec arm64ec-data-sym.obj -dll -noentry @@ -15,9 +17,52 @@ RUN: lld-link -out:test.dll -machine:arm64x arm64x-resource.obj -dll -noentry RUN: llvm-readobj --file-headers test.dll | FileCheck -check-prefix=ARM64X-HEADER %s ARM64X-HEADER: Machine: IMAGE_FILE_MACHINE_ARM64 (0xAA64) +arm64x object files are allowed with -machine:arm64 as well +RUN: lld-link -out:test.dll -machine:arm64 arm64x-resource.obj -dll -noentry + +RUN: lld-link -out:test.dll -machine:arm64ec arm64ec-data-sym.obj x86_64-data-sym.obj \ +RUN: arm64x-resource.obj -dll -noentry +RUN: llvm-readobj --file-headers test.dll | FileCheck -check-prefix=ARM64EC-HEADER %s + +RUN: llvm-readobj --hex-dump=.data test.dll | FileCheck -check-prefix=ARM64EC-DATA %s +ARM64EC-DATA: 02020202 03030303 + +RUN: lld-link -out:test.dll -machine:arm64x x86_64-data-sym.obj arm64-data-sym.obj \ +RUN: arm64ec-data-sym.obj arm64x-resource.obj -dll -noentry +RUN: llvm-readobj --file-headers test.dll | FileCheck -check-prefix=ARM64X-HEADER %s + +RUN: llvm-readobj --hex-dump=.data test.dll | FileCheck -check-prefix=ARM64X-DATA %s +ARM64X-DATA: 03030303 01010101 02020202 + +RUN: not lld-link -out:test.dll -machine:arm64 arm64-data-sym.obj arm64ec-data-sym.obj \ +RUN: -dll -noentry 2>&1 | FileCheck -check-prefix=INCOMPAT1 %s +INCOMPAT1: lld-link: error: arm64ec-data-sym.obj: machine type arm64ec conflicts with arm64 + +RUN: not lld-link -out:test.dll -machine:arm64ec arm64ec-data-sym.obj arm64-data-sym.obj \ +RUN: -dll 
-noentry 2>&1 | FileCheck -check-prefix=INCOMPAT2 %s +INCOMPAT2: lld-link: error: arm64-data-sym.obj: machine type arm64 conflicts with arm64ec + +RUN: not lld-link -out:test.dll -machine:arm64 arm64-data-sym.obj x86_64-data-sym.obj \ +RUN: -dll -noentry 2>&1 | FileCheck -check-prefix=INCOMPAT3 %s +INCOMPAT3: lld-link: error: x86_64-data-sym.obj: machine type x64 conflicts with arm64 + #--- arm64ec-data-sym.s .data .globl arm64ec_data_sym .p2align 2, 0x0 arm64ec_data_sym: .word 0x02020202 + +#--- arm64-data-sym.s + .data + .globl arm64_data_sym + .p2align 2, 0x0 +arm64_data_sym: + .word 0x01010101 + +#--- x86_64-data-sym.s + .data + .globl x86_64_data_sym + .p2align 2, 0x0 +x86_64_data_sym: + .long 0x03030303 From 72c17a0d660ae888010b65486014306b91eb9c4f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 10:53:43 -0700 Subject: [PATCH 015/704] [AArch64] Remove unused declaration LowerSCALAR_TO_VECTOR The corresponding function definition was removed by: commit 85d6a16c46616336c52792e7a490e70ea8b6461e Author: Tim Northover Date: Fri Apr 4 09:03:09 2014 +0000 --- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2430ad9230138..689c2d1860064 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1053,7 +1053,6 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; From c65fd1cef958789b6f21b280cde8763158d816e9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 10:53:45 -0700 Subject: [PATCH 016/704] [AArch64] Remove unused declaration legalizeShuffleVector The declaration was added without a corresponding function definition by: commit 4c52fb1a5ee20846627d16e38f5dec08c08f8884 Author: Vladislav Dzhidzhoev Date: Mon Aug 15 21:51:13 2022 +0300 --- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index 2a13a3606d23c..c10f6e071ed43 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -47,7 +47,6 @@ class AArch64LegalizerInfo : public LegalizerInfo { MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const; bool legalizeVectorTrunc(MachineInstr &MI, LegalizerHelper &Helper) const; - bool legalizeShuffleVector(MachineInstr &MI, LegalizerHelper &Helper) const; bool legalizeBitfieldExtract(MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const; bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI, From d035489297b925edf1ae2412b718b88da5dbb76e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 10:53:46 -0700 Subject: [PATCH 017/704] [AArch64] Remove unused declaration tryParseMRSSystemRegister The corresponding definition was removed by: commit e8b4166accfa03b9693bea34e53eb384f4b99198 Author: Bradley Smith Date: Wed 
Apr 9 14:43:06 2014 +0000 --- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 39e02fc41d209..beb360342f913 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -248,7 +248,6 @@ class AArch64AsmParser : public MCTargetAsmParser { OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); OperandMatchResultTy tryParseBarriernXSOperand(OperandVector &Operands); - OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); OperandMatchResultTy tryParseSysReg(OperandVector &Operands); OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); template From 5f6c571b6f1f5fd3fefb1a3be33d80bd12fb87b8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 11:01:53 -0700 Subject: [PATCH 018/704] [DebugInfo] Remove unused declaration getFirstFunctionInfoAddress The declaration was added without a corresponding function definition by: commit d8e077e2caebc1415fb7af1714dd436adf99b6bf Author: Greg Clayton Date: Mon Mar 6 15:57:17 2023 -0800 --- llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h b/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h index c7cee0593c035..bca3a83cc6850 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h +++ b/llvm/include/llvm/DebugInfo/GSYM/GsymCreator.h @@ -264,14 +264,6 @@ class GsymCreator { /// and strings over into this object. void fixupInlineInfo(const GsymCreator &SrcGC, InlineInfo &II); - /// Get the first function info address from this GSYM file. - /// - /// This is used to add a suffix to segmented GSYM files to indicate the first - /// address for the first function info within the file. - /// - /// \returns The first function info address. - uint64_t getFirstFunctionInfoAddress() const; - /// Save this GSYM file into segments that are roughly \a SegmentSize in size. /// /// When segemented GSYM files are saved to disk, they will use \a Path as a From 446efd29b666e14d560ad11b328e84ff62a2f5fb Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 11:01:54 -0700 Subject: [PATCH 019/704] [ExecutionEngine] Remove unused declaration defineNonExistent The declaration was added without a corresponding function definition by: commit cb84e4827e43921659e75509dfb42ebf56c50502 Author: Lang Hames Date: Wed Mar 25 13:07:00 2020 -0700 --- llvm/include/llvm/ExecutionEngine/Orc/Core.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h index 2c6484f1795cd..c51a15c8ed375 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -607,20 +607,6 @@ class MaterializationResponsibility { /// callbacks, metadata). Error defineMaterializing(SymbolFlagsMap SymbolFlags); - /// Define the given symbols as non-existent, removing it from the symbol - /// table and notifying any pending queries. Queries that lookup up the - /// symbol using the SymbolLookupFlags::WeaklyReferencedSymbol flag will - /// behave as if the symbol had not been matched in the first place. Queries - /// that required this symbol will fail with a missing symbol definition - /// error. 
- /// - /// This method is intended to support cleanup of special symbols like - /// initializer symbols: Queries using - /// SymbolLookupFlags::WeaklyReferencedSymbol can be used to trigger their - /// emission, and this method can be used to remove them from the JITDylib - /// once materialization is complete. - void defineNonExistent(ArrayRef Symbols); - /// Notify all not-yet-emitted covered by this MaterializationResponsibility /// instance that an error has occurred. /// This will remove all symbols covered by this MaterializationResponsibilty From 23b9faddeaccf2b71b45a9c5fa804e46d1d528ce Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 11:01:56 -0700 Subject: [PATCH 020/704] [ExecutionEngine] Remove unused declaration recordRuntimeRegistrationFunctions The declaration was added without a corresponding function definition by: commit 217fd59cc5849f1afdbb8a74807a80b23f2da227 Author: Lang Hames Date: Thu Jan 12 23:17:46 2023 -0800 --- llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h index 4800686d817d5..15dae6f920d57 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h @@ -165,8 +165,6 @@ class MachOPlatform : public Platform { Error bootstrapPipelineRecordRuntimeFunctions(jitlink::LinkGraph &G); Error bootstrapPipelineEnd(jitlink::LinkGraph &G); - Error recordRuntimeRegistrationFunctions(jitlink::LinkGraph &G); - Error associateJITDylibHeaderSymbol(jitlink::LinkGraph &G, MaterializationResponsibility &MR); From fcc135a8d6a7fb3600889778645fd7c0ed12a399 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 29 May 2023 19:23:51 +0100 Subject: [PATCH 021/704] [LV] Remove dead CHECK lines after 280656eae95a9cbf. Those check lines were left over after adding new run lines in 280656eae95a9cbf. 
--- .../LoopVectorize/uniform_across_vf_induction2.ll | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll index f603357b827c2..5ac38497f8a11 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll @@ -4,9 +4,7 @@ ; for (iv = 0, iv2 = 0 ; ; iv += 1, iv2 += 1) B[iv] = A[iv/1 + iv2/1] + 42; define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { -; CHECK-LABEL: define void @ld_div1_step1_start0_ind2 -; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { -; ; VF2-LABEL: define void @ld_div1_step1_start0_ind2 +; VF2-LABEL: define void @ld_div1_step1_start0_ind2 ; VF2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { ; VF2-NEXT: entry: ; VF2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -149,8 +147,6 @@ exit: ; for (iv = 0, iv2 = 0 ; ; iv += 1, iv2 += 1) B[iv] = A[iv/2 + iv2/2] + 42; define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { -; CHECK-LABEL: define void @ld_div2_step1_start0_ind2 -; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { ; VF2-LABEL: define void @ld_div2_step1_start0_ind2 ; VF2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { ; VF2-NEXT: entry: From da54bd230aa45558945b53d2113a35aafb9650fb Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Mon, 29 May 2023 14:51:08 -0400 Subject: [PATCH 022/704] [gn build] (manually) port 9f6250f --- .../secondary/clang/include/clang/Basic/BUILD.gn | 15 +++++++++++++++ llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn | 3 +++ .../utils/gn/secondary/clang/lib/Headers/BUILD.gn | 7 +++++++ 3 files changed, 25 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn index 40b7ba3780c97..7463ea457161e 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn @@ -127,6 +127,21 @@ clang_tablegen("arm_sve_sema_rangechecks") { td_file = "arm_sve.td" } +clang_tablegen("arm_sme_builtins") { + args = [ "-gen-arm-sme-builtins" ] + td_file = "arm_sme.td" +} + +clang_tablegen("arm_sme_builtin_cg") { + args = [ "-gen-arm-sme-builtin-codegen" ] + td_file = "arm_sme.td" +} + +clang_tablegen("arm_sme_sema_rangechecks") { + args = [ "-gen-arm-sme-sema-rangechecks" ] + td_file = "arm_sme.td" +} + clang_tablegen("arm_cde_builtins") { args = [ "-gen-arm-cde-builtin-def" ] td_file = "arm_cde.td" diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index 1f7957e4cb490..ab89fac29c4e6 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -26,6 +26,9 @@ static_library("Basic") { "//clang/include/clang/Basic:DiagnosticGroups", "//clang/include/clang/Basic:arm_cde_builtins", "//clang/include/clang/Basic:arm_mve_builtins", + "//clang/include/clang/Basic:arm_sme_builtins", + "//clang/include/clang/Basic:arm_sme_builtin_cg", + "//clang/include/clang/Basic:arm_sme_sema_rangechecks", "//clang/include/clang/Basic:arm_sve_builtins", "//clang/include/clang/Basic:arm_sve_typeflags", "//clang/include/clang/Basic:diags_tablegen", diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 
1c77969c2631f..a8a193876061f 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -22,6 +22,13 @@ clang_tablegen("arm_sve") { output_name = "arm_sve.h" } +# Generate arm_sme_draft_spec_subject_to_change.h +clang_tablegen("arm_sme_draft_spec_subject_to_change") { + args = [ "-gen-arm-sme-header" ] + td_file = "//clang/include/clang/Basic/arm_sme.td" + output_name = "arm_sme_draft_spec_subject_to_change.h" +} + # Generate arm_bf16.h clang_tablegen("arm_bf16") { args = [ "-gen-arm-bf16" ] From 75c75215e37bfa4ac9cd198d0b1489379a9e6281 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 11:56:50 -0700 Subject: [PATCH 023/704] [Vectorize] Remove unused declaration requiresTooManyRuntimeChecks The corresponding function definition was removed by: commit 644a965c1efef68f22d9495e4cefbb599c214788 Author: Florian Hahn Date: Mon Jul 4 15:10:48 2022 +0100 --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index d67772129cefd..bce3ccae50126 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -353,9 +353,6 @@ class LoopVectorizationPlanner { getDecisionAndClampRange(const std::function &Predicate, VFRange &Range); - /// Check if the number of runtime checks exceeds the threshold. - bool requiresTooManyRuntimeChecks() const; - /// \return The most profitable vectorization factor and the cost of that VF /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if /// epilogue vectorization is not supported for the loop. From e9f14d49458b492a3d993328da7db9ff14c44f9f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 11:56:52 -0700 Subject: [PATCH 024/704] [bugpoint] Remove unused declaration debugPassMiscompilation The declaration seems to have been unused for at least 15 years. --- llvm/tools/bugpoint/BugDriver.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/llvm/tools/bugpoint/BugDriver.h b/llvm/tools/bugpoint/BugDriver.h index b7c9edc5b8121..9fb0880b9cd01 100644 --- a/llvm/tools/bugpoint/BugDriver.h +++ b/llvm/tools/bugpoint/BugDriver.h @@ -101,15 +101,6 @@ class BugDriver { /// input. Error debugMiscompilation(); - /// debugPassMiscompilation - This method is called when the specified pass - /// miscompiles Program as input. It tries to reduce the testcase to - /// something that smaller that still miscompiles the program. - /// ReferenceOutput contains the filename of the file containing the output we - /// are to match. - /// - bool debugPassMiscompilation(const PassInfo *ThePass, - const std::string &ReferenceOutput); - /// compileSharedObject - This method creates a SharedObject from a given /// BitcodeFile for debugging a code generator. 
/// From 5f2e98051db1ead2c30f5740d68a36d6af829d4f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 11:56:53 -0700 Subject: [PATCH 025/704] [Coroutines] Remove unused variable AsyncFuncTy The last use was removed by: commit e760ec2a01fba0d90e2fea33cf99b75baa2c2a1c Author: Arnold Schwaighofer Date: Thu Feb 11 08:25:31 2021 -0800 --- llvm/lib/Transforms/Coroutines/CoroInternal.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 372090356e245..067fb6bba47e3 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -127,7 +127,6 @@ struct LLVM_LIBRARY_VISIBILITY Shape { }; struct AsyncLoweringStorage { - FunctionType *AsyncFuncTy; Value *Context; CallingConv::ID AsyncCC; unsigned ContextArgNo; From dc61666042443f215f1f144516bc3ac674f5d1c6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 12:19:13 -0700 Subject: [PATCH 026/704] [Analysis] Remove unused declarations visitEHBeginCatch and visitEHEndCatch The corresponding function definitions were removed by: commit 14e773500e036de57ed0ca4af6fddc1f8b6767d8 Author: Reid Kleckner Date: Fri Oct 9 23:34:53 2015 +0000 --- llvm/lib/Analysis/Lint.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 07bf9bda7f1c3..ff022006df65a 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -93,8 +93,6 @@ class Lint : public InstVisitor { void visitCallBase(CallBase &CB); void visitMemoryReference(Instruction &I, const MemoryLocation &Loc, MaybeAlign Alignment, Type *Ty, unsigned Flags); - void visitEHBeginCatch(IntrinsicInst *II); - void visitEHEndCatch(IntrinsicInst *II); void visitReturnInst(ReturnInst &I); void visitLoadInst(LoadInst &I); From 638112737efc3580391a645f301e5c349ff9d043 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 12:19:14 -0700 Subject: [PATCH 027/704] [Analysis] Remove unused function stripIntegerCast The last use was removed by: commit d5b840131223f2ffef4e48ca769ad1eb7bb1869a Author: Philip Reames Date: Thu May 11 08:10:49 2023 -0700 --- llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 2 -- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 7 ------- 2 files changed, 9 deletions(-) diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 0258285746d92..a45f117249e69 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -701,8 +701,6 @@ class LoopAccessInfo { DenseMap SymbolicStrides; }; -Value *stripIntegerCast(Value *V); - /// Return the SCEV corresponding to a pointer with the symbolic stride /// replaced with constant one, assuming the SCEV predicate associated with /// \p PSE is true. 
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 5a0b1abe96d9c..6c271a8b2f7c5 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -146,13 +146,6 @@ bool VectorizerParams::isInterleaveForced() { return ::VectorizationInterleave.getNumOccurrences() > 0; } -Value *llvm::stripIntegerCast(Value *V) { - if (auto *CI = dyn_cast(V)) - if (CI->getOperand(0)->getType()->isIntegerTy()) - return CI->getOperand(0); - return V; -} - const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const DenseMap &PtrToStride, Value *Ptr) { From 1ee839d0ce13820955749a6d0e2506c4ea55d030 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 22 May 2023 12:47:00 -0700 Subject: [PATCH 028/704] [libc++] Use the new .gen tests to implement __verbose_abort tests This reduces the amount of boilerplate that we need to generate for each commit. It also resolves a problem where the modular CI would run extremely slow on this test because we'd define a macro before including the standard library, defeating the module cache. Differential Revision: https://reviews.llvm.org/D151156 --- .../headers_declare_verbose_abort.gen.py | 35 + .../headers_declare_verbose_abort.sh.cpp | 755 ------------------ libcxx/utils/generate_header_tests.py | 118 +-- .../utils/libcxx/test/header_information.py | 113 +++ 4 files changed, 157 insertions(+), 864 deletions(-) create mode 100644 libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py delete mode 100644 libcxx/test/libcxx/assertions/headers_declare_verbose_abort.sh.cpp create mode 100644 libcxx/utils/libcxx/test/header_information.py diff --git a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py new file mode 100644 index 0000000000000..2bf9b66f1be94 --- /dev/null +++ b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.gen.py @@ -0,0 +1,35 @@ +#===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===----------------------------------------------------------------------===## + +# Test that all public C++ headers define the verbose termination function, which +# is required for users to be able to include any public header and then override +# the function using a strong definition. + +# RUN: %{python} %s %{libcxx}/utils + +import sys +sys.path.append(sys.argv[1]) +from libcxx.test.header_information import header_restrictions, public_headers + +for header in public_headers: + # Skip C compatibility headers. 
+ if header.endswith('.h'): + continue + + test_condition_begin = f'#if {header_restrictions[header]}' if header in header_restrictions else '' + test_condition_end = '#endif' if header in header_restrictions else '' + XFAIL = 'XFAIL' # Make sure Lit doesn't think we are XFAILing this test + print(f"""\ +//--- {header}.compile.pass.cpp +// {XFAIL}: availability-verbose_abort-missing +#include <__config> +{test_condition_begin} +#include <{header}> +using HandlerType = decltype(std::__libcpp_verbose_abort); +{test_condition_end} +""") diff --git a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.sh.cpp b/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.sh.cpp deleted file mode 100644 index 5587e21a131ad..0000000000000 --- a/libcxx/test/libcxx/assertions/headers_declare_verbose_abort.sh.cpp +++ /dev/null @@ -1,755 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Test that all public C++ headers define the verbose termination function, which -// is required for users to be able to include any public header and then override -// the function using a strong definition. - -// XFAIL: availability-verbose_abort-missing - -/* -BEGIN-SCRIPT - -for i, header in enumerate(public_headers): - # Skip C compatibility headers. - if header.endswith('.h'): - continue - - vars = { - 'run': 'RUN', - 'i': i, - 'restrictions': ' && ' + header_restrictions[header] if header in header_restrictions else '', - 'header': header - } - - print("""\ -// {run}: %{{build}} -DTEST_{i} -#if defined(TEST_{i}){restrictions} -# include <{header}> - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif -""".format(**vars)) - -END-SCRIPT -*/ - -#include <__config> - -int main(int, char**) { return 0; } - -// DO NOT MANUALLY EDIT ANYTHING BETWEEN THE MARKERS BELOW -// GENERATED-MARKER -// RUN: %{build} -DTEST_0 -#if defined(TEST_0) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_1 -#if defined(TEST_1) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_2 -#if defined(TEST_2) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_3 -#if defined(TEST_3) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_4 -#if defined(TEST_4) && !defined(_LIBCPP_HAS_NO_THREADS) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_5 -#if defined(TEST_5) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_6 -#if defined(TEST_6) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_7 -#if defined(TEST_7) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_8 -#if defined(TEST_8) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_9 -#if defined(TEST_9) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_10 -#if defined(TEST_10) -# include - 
using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_11 -#if defined(TEST_11) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_12 -#if defined(TEST_12) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_13 -#if defined(TEST_13) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_14 -#if defined(TEST_14) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_15 -#if defined(TEST_15) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_16 -#if defined(TEST_16) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_17 -#if defined(TEST_17) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_18 -#if defined(TEST_18) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_19 -#if defined(TEST_19) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_20 -#if defined(TEST_20) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_21 -#if defined(TEST_21) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_22 -#if defined(TEST_22) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_24 -#if defined(TEST_24) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_25 -#if defined(TEST_25) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_26 -#if defined(TEST_26) && (defined(__cpp_impl_coroutine) && __cpp_impl_coroutine >= 201902L) || (defined(__cpp_coroutines) && __cpp_coroutines >= 201703L) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_27 -#if defined(TEST_27) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_28 -#if defined(TEST_28) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_29 -#if defined(TEST_29) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_30 -#if defined(TEST_30) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_31 -#if defined(TEST_31) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_32 -#if defined(TEST_32) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_33 -#if defined(TEST_33) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_34 -#if defined(TEST_34) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_35 -#if defined(TEST_35) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_36 -#if defined(TEST_36) -# include - using HandlerType = 
decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_37 -#if defined(TEST_37) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_39 -#if defined(TEST_39) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_40 -#if defined(TEST_40) && !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_41 -#if defined(TEST_41) && !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_42 -#if defined(TEST_42) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_44 -#if defined(TEST_44) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_45 -#if defined(TEST_45) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_46 -#if defined(TEST_46) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_48 -#if defined(TEST_48) && !defined(_LIBCPP_HAS_NO_FILESYSTEM_LIBRARY) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_50 -#if defined(TEST_50) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_51 -#if defined(TEST_51) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_52 -#if defined(TEST_52) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) && !defined(_LIBCPP_HAS_NO_FSTREAM) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_53 -#if defined(TEST_53) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_54 -#if defined(TEST_54) && !defined(_LIBCPP_HAS_NO_THREADS) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_55 -#if defined(TEST_55) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_57 -#if defined(TEST_57) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_58 -#if defined(TEST_58) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_59 -#if defined(TEST_59) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_60 -#if defined(TEST_60) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_61 -#if defined(TEST_61) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_62 -#if defined(TEST_62) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_63 -#if defined(TEST_63) && !defined(_LIBCPP_HAS_NO_THREADS) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_64 -#if defined(TEST_64) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} 
-DTEST_66 -#if defined(TEST_66) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_67 -#if defined(TEST_67) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_69 -#if defined(TEST_69) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_71 -#if defined(TEST_71) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_72 -#if defined(TEST_72) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_73 -#if defined(TEST_73) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_74 -#if defined(TEST_74) && !defined(_LIBCPP_HAS_NO_THREADS) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_75 -#if defined(TEST_75) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_76 -#if defined(TEST_76) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_77 -#if defined(TEST_77) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_78 -#if defined(TEST_78) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_79 -#if defined(TEST_79) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_80 -#if defined(TEST_80) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_81 -#if defined(TEST_81) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_82 -#if defined(TEST_82) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_83 -#if defined(TEST_83) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_84 -#if defined(TEST_84) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_85 -#if defined(TEST_85) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_86 -#if defined(TEST_86) && !defined(_LIBCPP_HAS_NO_THREADS) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_87 -#if defined(TEST_87) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_89 -#if defined(TEST_89) && !defined(_LIBCPP_HAS_NO_THREADS) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_90 -#if defined(TEST_90) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_91 -#if defined(TEST_91) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_92 -#if defined(TEST_92) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_93 -#if defined(TEST_93) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - 
-// RUN: %{build} -DTEST_97 -#if defined(TEST_97) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_101 -#if defined(TEST_101) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_102 -#if defined(TEST_102) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_104 -#if defined(TEST_104) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_105 -#if defined(TEST_105) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_106 -#if defined(TEST_106) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_108 -#if defined(TEST_108) && !defined(_LIBCPP_HAS_NO_THREADS) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_109 -#if defined(TEST_109) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_110 -#if defined(TEST_110) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_111 -#if defined(TEST_111) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_112 -#if defined(TEST_112) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_114 -#if defined(TEST_114) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_115 -#if defined(TEST_115) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_116 -#if defined(TEST_116) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_117 -#if defined(TEST_117) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_118 -#if defined(TEST_118) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_119 -#if defined(TEST_119) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_120 -#if defined(TEST_120) -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_123 -#if defined(TEST_123) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_124 -#if defined(TEST_124) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_125 -#if defined(TEST_125) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_126 -#if defined(TEST_126) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_127 -#if defined(TEST_127) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_128 -#if defined(TEST_128) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_129 -#if defined(TEST_129) && __cplusplus 
>= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_130 -#if defined(TEST_130) && !defined(_LIBCPP_HAS_NO_LOCALIZATION) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_131 -#if defined(TEST_131) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_132 -#if defined(TEST_132) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_133 -#if defined(TEST_133) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_134 -#if defined(TEST_134) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_135 -#if defined(TEST_135) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_136 -#if defined(TEST_136) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_137 -#if defined(TEST_137) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// RUN: %{build} -DTEST_138 -#if defined(TEST_138) && __cplusplus >= 201103L -# include - using HandlerType = decltype(std::__libcpp_verbose_abort); -#endif - -// GENERATED-MARKER diff --git a/libcxx/utils/generate_header_tests.py b/libcxx/utils/generate_header_tests.py index 905e7a4cec4f4..439cf4862a249 100755 --- a/libcxx/utils/generate_header_tests.py +++ b/libcxx/utils/generate_header_tests.py @@ -7,68 +7,7 @@ import pathlib import re -header_restrictions = { - "barrier": "!defined(_LIBCPP_HAS_NO_THREADS)", - "future": "!defined(_LIBCPP_HAS_NO_THREADS)", - "latch": "!defined(_LIBCPP_HAS_NO_THREADS)", - "mutex": "!defined(_LIBCPP_HAS_NO_THREADS)", - "semaphore": "!defined(_LIBCPP_HAS_NO_THREADS)", - "shared_mutex": "!defined(_LIBCPP_HAS_NO_THREADS)", - "stdatomic.h": "__cplusplus > 202002L && !defined(_LIBCPP_HAS_NO_THREADS)", - "thread": "!defined(_LIBCPP_HAS_NO_THREADS)", - "filesystem": "!defined(_LIBCPP_HAS_NO_FILESYSTEM_LIBRARY)", - # TODO(LLVM-17): simplify this to __cplusplus >= 202002L - "coroutine": "(defined(__cpp_impl_coroutine) && __cpp_impl_coroutine >= 201902L) || (defined(__cpp_coroutines) && __cpp_coroutines >= 201703L)", - "clocale": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "codecvt": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "fstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION) && !defined(_LIBCPP_HAS_NO_FSTREAM)", - "iomanip": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "ios": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "iostream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "istream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "locale.h": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "locale": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "ostream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "regex": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "sstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "streambuf": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "strstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - "wctype.h": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", - "cwctype": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", - "cwchar": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", - "wchar.h": 
"!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", - "experimental/algorithm": "__cplusplus >= 201103L", - "experimental/deque": "__cplusplus >= 201103L", - "experimental/forward_list": "__cplusplus >= 201103L", - "experimental/functional": "__cplusplus >= 201103L", - "experimental/iterator": "__cplusplus >= 201103L", - "experimental/list": "__cplusplus >= 201103L", - "experimental/map": "__cplusplus >= 201103L", - "experimental/memory_resource": "__cplusplus >= 201103L", - "experimental/propagate_const": "__cplusplus >= 201103L", - "experimental/regex": "!defined(_LIBCPP_HAS_NO_LOCALIZATION) && __cplusplus >= 201103L", - "experimental/set": "__cplusplus >= 201103L", - "experimental/simd": "__cplusplus >= 201103L", - "experimental/span": "__cplusplus >= 201103L", - "experimental/string": "__cplusplus >= 201103L", - "experimental/type_traits": "__cplusplus >= 201103L", - "experimental/unordered_map": "__cplusplus >= 201103L", - "experimental/unordered_set": "__cplusplus >= 201103L", - "experimental/utility": "__cplusplus >= 201103L", - "experimental/vector": "__cplusplus >= 201103L", -} - -private_headers_still_public_in_modules = [ - "__assert", - "__config", - "__config_site.in", - "__debug", - "__hash_table", - "__threading_support", - "__tree", - "__undef_macros", - "__verbose_abort", -] +import libcxx.test.header_information def find_script(file): @@ -131,60 +70,21 @@ def produce(test_file, variables): f.write(new_content) -def is_header(file): - """Returns whether the given file is a header (i.e. not a directory or the modulemap file).""" - return ( - not file.is_dir() - and not file.name == "module.modulemap.in" - and file.name != "libcxx.imp" - ) - - def main(): monorepo_root = pathlib.Path( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) ) - include = pathlib.Path(os.path.join(monorepo_root, "libcxx", "include")) test = pathlib.Path(os.path.join(monorepo_root, "libcxx", "test")) assert monorepo_root.exists() - toplevel_headers = sorted( - str(p.relative_to(include)) for p in include.glob("[a-z]*") if is_header(p) - ) - experimental_headers = sorted( - str(p.relative_to(include)) - for p in include.glob("experimental/[a-z]*") - if is_header(p) - ) - public_headers = toplevel_headers + experimental_headers - private_headers = sorted( - str(p.relative_to(include)) - for p in include.rglob("*") - if is_header(p) - and str(p.relative_to(include)).startswith("__") - and not p.name.startswith("pstl") - ) - variables = { - "toplevel_headers": toplevel_headers, - "experimental_headers": experimental_headers, - "public_headers": public_headers, - "private_headers": private_headers, - "header_restrictions": header_restrictions, - "private_headers_still_public_in_modules": private_headers_still_public_in_modules, - } - - produce( - test.joinpath("libcxx/assertions/headers_declare_verbose_abort.sh.cpp"), - variables, - ) - produce(test.joinpath("libcxx/clang_tidy.sh.cpp"), variables) - produce(test.joinpath("libcxx/double_include.sh.cpp"), variables) - produce(test.joinpath("libcxx/min_max_macros.compile.pass.cpp"), variables) - produce(test.joinpath("libcxx/modules_include.sh.cpp"), variables) - produce(test.joinpath("libcxx/nasty_macros.compile.pass.cpp"), variables) - produce(test.joinpath("libcxx/no_assert_include.compile.pass.cpp"), variables) - produce(test.joinpath("libcxx/private_headers.verify.cpp"), variables) - produce(test.joinpath("libcxx/transitive_includes.sh.cpp"), variables) + produce(test.joinpath("libcxx/clang_tidy.sh.cpp"), 
libcxx.test.header_information.variables) + produce(test.joinpath("libcxx/double_include.sh.cpp"), libcxx.test.header_information.variables) + produce(test.joinpath("libcxx/min_max_macros.compile.pass.cpp"), libcxx.test.header_information.variables) + produce(test.joinpath("libcxx/modules_include.sh.cpp"), libcxx.test.header_information.variables) + produce(test.joinpath("libcxx/nasty_macros.compile.pass.cpp"), libcxx.test.header_information.variables) + produce(test.joinpath("libcxx/no_assert_include.compile.pass.cpp"), libcxx.test.header_information.variables) + produce(test.joinpath("libcxx/private_headers.verify.cpp"), libcxx.test.header_information.variables) + produce(test.joinpath("libcxx/transitive_includes.sh.cpp"), libcxx.test.header_information.variables) if __name__ == "__main__": diff --git a/libcxx/utils/libcxx/test/header_information.py b/libcxx/utils/libcxx/test/header_information.py new file mode 100644 index 0000000000000..1f4bb34248566 --- /dev/null +++ b/libcxx/utils/libcxx/test/header_information.py @@ -0,0 +1,113 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +import os, pathlib + +header_restrictions = { + "barrier": "!defined(_LIBCPP_HAS_NO_THREADS)", + "future": "!defined(_LIBCPP_HAS_NO_THREADS)", + "latch": "!defined(_LIBCPP_HAS_NO_THREADS)", + "mutex": "!defined(_LIBCPP_HAS_NO_THREADS)", + "semaphore": "!defined(_LIBCPP_HAS_NO_THREADS)", + "shared_mutex": "!defined(_LIBCPP_HAS_NO_THREADS)", + "stdatomic.h": "__cplusplus > 202002L && !defined(_LIBCPP_HAS_NO_THREADS)", + "thread": "!defined(_LIBCPP_HAS_NO_THREADS)", + "filesystem": "!defined(_LIBCPP_HAS_NO_FILESYSTEM_LIBRARY)", + # TODO(LLVM-17): simplify this to __cplusplus >= 202002L + "coroutine": "(defined(__cpp_impl_coroutine) && __cpp_impl_coroutine >= 201902L) || (defined(__cpp_coroutines) && __cpp_coroutines >= 201703L)", + "clocale": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "codecvt": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "fstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION) && !defined(_LIBCPP_HAS_NO_FSTREAM)", + "iomanip": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "ios": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "iostream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "istream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "locale.h": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "locale": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "ostream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "regex": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "sstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "streambuf": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "strstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", + "wctype.h": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", + "cwctype": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", + "cwchar": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", + "wchar.h": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", + "experimental/algorithm": "__cplusplus >= 201103L", + "experimental/deque": "__cplusplus >= 201103L", + "experimental/forward_list": "__cplusplus >= 201103L", + "experimental/functional": "__cplusplus >= 201103L", + "experimental/iterator": "__cplusplus >= 201103L", + "experimental/list": "__cplusplus >= 201103L", + "experimental/map": "__cplusplus >= 201103L", + 
"experimental/memory_resource": "__cplusplus >= 201103L", + "experimental/propagate_const": "__cplusplus >= 201103L", + "experimental/regex": "!defined(_LIBCPP_HAS_NO_LOCALIZATION) && __cplusplus >= 201103L", + "experimental/set": "__cplusplus >= 201103L", + "experimental/simd": "__cplusplus >= 201103L", + "experimental/span": "__cplusplus >= 201103L", + "experimental/string": "__cplusplus >= 201103L", + "experimental/type_traits": "__cplusplus >= 201103L", + "experimental/unordered_map": "__cplusplus >= 201103L", + "experimental/unordered_set": "__cplusplus >= 201103L", + "experimental/utility": "__cplusplus >= 201103L", + "experimental/vector": "__cplusplus >= 201103L", +} + +private_headers_still_public_in_modules = [ + "__assert", + "__config", + "__config_site.in", + "__debug", + "__hash_table", + "__threading_support", + "__tree", + "__undef_macros", + "__verbose_abort", +] + +def is_header(file): + """Returns whether the given file is a header (i.e. not a directory or the modulemap file).""" + return ( + not file.is_dir() + and not file.name == "module.modulemap.in" + and not file.name == "CMakeLists.txt" + and file.name != "libcxx.imp" + ) + +monorepo_root = pathlib.Path( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))) +) +include = pathlib.Path(os.path.join(monorepo_root, "libcxx", "include")) +test = pathlib.Path(os.path.join(monorepo_root, "libcxx", "test")) +assert monorepo_root.exists() + +toplevel_headers = sorted( + str(p.relative_to(include)) for p in include.glob("[a-z]*") if is_header(p) +) +experimental_headers = sorted( + str(p.relative_to(include)) + for p in include.glob("experimental/[a-z]*") + if is_header(p) +) +public_headers = toplevel_headers + experimental_headers +private_headers = sorted( + str(p.relative_to(include)) + for p in include.rglob("*") + if is_header(p) + and str(p.relative_to(include)).startswith("__") + and not p.name.startswith("pstl") +) +variables = { + "toplevel_headers": toplevel_headers, + "experimental_headers": experimental_headers, + "public_headers": public_headers, + "private_headers": private_headers, + "header_restrictions": header_restrictions, + "private_headers_still_public_in_modules": private_headers_still_public_in_modules, +} From 725f0be8aec37561b8f2237c7f089742142b0e16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 29 May 2023 22:49:34 +0300 Subject: [PATCH 029/704] [compiler-rt] [test] [asan] Fix Windows tests in i386 mode after StripFunctionName() improvements This applies the same fix as a32a16311050fbccc03638b197910dc1415f60ab to a testcase which isn't executed in x86_64 mode. 
--- .../test/asan/TestCases/Windows/dll_intercept_memchr.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_intercept_memchr.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_intercept_memchr.cpp index c41d937d32404..6d1894fb862e6 100644 --- a/compiler-rt/test/asan/TestCases/Windows/dll_intercept_memchr.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/dll_intercept_memchr.cpp @@ -17,7 +17,7 @@ int test_function() { memchr(buff, 'z', 7); // CHECK: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] // CHECK: READ of size 7 at [[ADDR]] thread T0 -// CHECK-NEXT: __asan_wrap_memchr +// CHECK-NEXT: memchr // CHECK-NEXT: memchr // CHECK-NEXT: test_function {{.*}}dll_intercept_memchr.cpp:[[@LINE-5]] // CHECK: Address [[ADDR]] is located in stack of thread T0 at offset {{.*}} in frame From 40af06ccc7bf9989b8deb410ed78a4795c22aa73 Mon Sep 17 00:00:00 2001 From: Roy Jacobson Date: Mon, 29 May 2023 22:57:39 +0300 Subject: [PATCH 030/704] [clang-tidy] Update UnusedReturnValueCheck types Update the types for UnusedReturnValueCheck after D151383. 1. Add std::errc, std::error_condition 2. Remove `absl::Status` - it's marked as `[[nodiscard]]` anyway so it's redundant (and might create double warnings) to check it. Reviewed By: PiotrZSL Differential Revision: https://reviews.llvm.org/D151650 --- .../clang-tidy/bugprone/UnusedReturnValueCheck.cpp | 5 +++-- .../docs/clang-tidy/checks/bugprone/unused-return-value.rst | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp index f8139381d7e01..bdc601c2445f5 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp @@ -127,9 +127,10 @@ UnusedReturnValueCheck::UnusedReturnValueCheck(llvm::StringRef Name, "::ttyname")), CheckedReturnTypes(utils::options::parseStringList( Options.get("CheckedReturnTypes", "::std::error_code;" + "::std::error_condition;" + "::std::errc;" "::std::expected;" - "::boost::system::error_code;" - "::abseil::Status"))) {} + "::boost::system::error_code"))) {} void UnusedReturnValueCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "CheckedFunctions", CheckedFunctions); diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst index 89c781b0fe714..c9b24dbcbb94f 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst @@ -50,7 +50,7 @@ Options Semicolon-separated list of function return types to check. By default the following function return types are checked: - `::std::error_code`, `::std::expected`, `::boost::system::error_code`, `::abseil::Status` + `::std::error_code`, `::std::error_condition`, `::std::errc`, `::std::expected`, `::boost::system::error_code` `cert-err33-c <../cert/err33-c.html>`_ is an alias of this check that checks a fixed and large set of standard library functions. From 0332f2c551e5d53adf077f8b523a7f337d881889 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 29 May 2023 21:04:59 +0100 Subject: [PATCH 031/704] [ConstraintElim] Add extra tests for and operand implying other op. Extend test coverage for and conditions where the first operand implies the second.
The update includes the select form of AND and a few cases where one of the AND operands gets simplified before solving. --- .../and-implied-by-operands.ll | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll index b14ea94f0d71c..2f684c555e69f 100644 --- a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll +++ b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll @@ -26,6 +26,31 @@ else: ret i1 1 } +define i1 @test_second_and_condition_implied_by_first_select_form(i8 %x) { +; CHECK-LABEL: @test_second_and_condition_implied_by_first_select_form( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 +; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i8 [[X]], 5 +; CHECK-NEXT: [[AND:%.*]] = select i1 [[C_1]], i1 [[T_1]], i1 false +; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: ret i1 false +; CHECK: else: +; CHECK-NEXT: ret i1 true +; +entry: + %c.1 = icmp ugt i8 %x, 10 + %t.1 = icmp ugt i8 %x, 5 + %and = select i1 %c.1, i1 %t.1, i1 false + br i1 %and, label %then, label %else + +then: + ret i1 0 + +else: + ret i1 1 +} + define i1 @test_same_cond_for_and(i8 %x) { ; CHECK-LABEL: @test_same_cond_for_and( ; CHECK-NEXT: entry: @@ -49,6 +74,29 @@ else: ret i1 1 } +define i1 @test_same_cond_for_and_select_form(i8 %x) { +; CHECK-LABEL: @test_same_cond_for_and_select_form( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X:%.*]], 10 +; CHECK-NEXT: [[AND:%.*]] = select i1 [[C_1]], i1 [[C_1]], i1 false +; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: ret i1 false +; CHECK: else: +; CHECK-NEXT: ret i1 true +; +entry: + %c.1 = icmp ugt i8 %x, 10 + %and = select i1 %c.1, i1 %c.1, i1 false + br i1 %and, label %then, label %else + +then: + ret i1 0 + +else: + ret i1 1 +} + define i1 @test_second_and_condition_not_implied_by_first(i8 %x) { ; CHECK-LABEL: @test_second_and_condition_not_implied_by_first( ; CHECK-NEXT: entry: @@ -73,3 +121,74 @@ then: else: ret i1 1 } + +define i1 @test_remove_variables(i1 %c, ptr %A, i64 %B, ptr %C) { +; CHECK-LABEL: @test_remove_variables( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN_1:%.*]], label [[EXIT:%.*]] +; CHECK: then.1: +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[C_1:%.*]] = icmp ult ptr [[TMP0]], [[A:%.*]] +; CHECK-NEXT: br i1 [[C_1]], label [[THEN_2:%.*]], label [[ELSE_2:%.*]] +; CHECK: then.2: +; CHECK-NEXT: [[C_2:%.*]] = icmp ne ptr [[A]], null +; CHECK-NEXT: [[C_3:%.*]] = icmp sgt i64 [[B:%.*]], 0 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_2]], [[C_3]] +; CHECK-NEXT: ret i1 [[AND]] +; CHECK: else.2: +; CHECK-NEXT: ret i1 false +; CHECK: exit: +; CHECK-NEXT: [[T:%.*]] = icmp eq ptr null, null +; CHECK-NEXT: ret i1 true +; +entry: + br i1 %c, label %then.1, label %exit + +then.1: + %0 = load ptr, ptr %C, align 8 + %c.1 = icmp ult ptr %0, %A + br i1 %c.1, label %then.2, label %else.2 + +then.2: + %c.2 = icmp ne ptr %A, null + %c.3 = icmp sgt i64 %B, 0 + %and = and i1 %c.2, %c.3 + ret i1 %and + +else.2: + ret i1 0 + +exit: + %t = icmp eq ptr null, null + ret i1 %t +} + +define i1 @test_and_op_0_simplified(i32 %v) { +; CHECK-LABEL: @test_and_op_0_simplified( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp sgt i32 [[V:%.*]], 0 +; CHECK-NEXT: [[T_1:%.*]] = icmp sgt i32 0, 0 +; 
CHECK-NEXT: [[AND:%.*]] = and i1 false, [[C_1]] +; CHECK-NEXT: ret i1 [[AND]] +; +entry: + %c.1 = icmp sgt i32 %v, 0 + %t.1 = icmp sgt i32 0, 0 + %and = and i1 %t.1, %c.1 + ret i1 %and +} + +define i1 @test_and_op_1_simplified(i32 %v) { +; CHECK-LABEL: @test_and_op_1_simplified( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp sgt i32 [[V:%.*]], 0 +; CHECK-NEXT: [[T_1:%.*]] = icmp sgt i32 0, 0 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_1]], false +; CHECK-NEXT: ret i1 [[AND]] +; +entry: + %c.1 = icmp sgt i32 %v, 0 + %t.1 = icmp sgt i32 0, 0 + %and = and i1 %c.1, %t.1 + ret i1 %and +} From 9bb34ca652b648c8d7ebfb183653f2ad2c66e5c4 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Mon, 29 May 2023 21:38:26 +0100 Subject: [PATCH 032/704] [RISCV][test] Expand bfloat.ll tests to include i16 bitcasts and load/store Pre-commit new tests used in D151663. --- llvm/test/CodeGen/RISCV/bfloat.ll | 100 ++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/bfloat.ll b/llvm/test/CodeGen/RISCV/bfloat.ll index e7583a595ff06..ae05425d9104f 100644 --- a/llvm/test/CodeGen/RISCV/bfloat.ll +++ b/llvm/test/CodeGen/RISCV/bfloat.ll @@ -87,6 +87,30 @@ define double @bfloat_to_double(bfloat %a) nounwind { ret double %1 } +define bfloat @i16_to_bfloat(i16 %a) nounwind { +; RV32I-ILP32-LABEL: i16_to_bfloat: +; RV32I-ILP32: # %bb.0: +; RV32I-ILP32-NEXT: ret +; +; RV64I-LP64-LABEL: i16_to_bfloat: +; RV64I-LP64: # %bb.0: +; RV64I-LP64-NEXT: ret + %1 = bitcast i16 %a to bfloat + ret bfloat %1 +} + +define i16 @bfloat_to_i16(bfloat %a) nounwind { +; RV32I-ILP32-LABEL: bfloat_to_i16: +; RV32I-ILP32: # %bb.0: +; RV32I-ILP32-NEXT: ret +; +; RV64I-LP64-LABEL: bfloat_to_i16: +; RV64I-LP64: # %bb.0: +; RV64I-LP64-NEXT: ret + %1 = bitcast bfloat %a to i16 + ret i16 %1 +} + define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind { ; RV32I-ILP32-LABEL: bfloat_add: ; RV32I-ILP32: # %bb.0: @@ -114,3 +138,79 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind { %1 = fadd bfloat %a, %b ret bfloat %1 } + +define bfloat @bfloat_load(ptr %a) nounwind { +; RV32I-ILP32-LABEL: bfloat_load: +; RV32I-ILP32: # %bb.0: +; RV32I-ILP32-NEXT: addi sp, sp, -16 +; RV32I-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-ILP32-NEXT: lh a1, 0(a0) +; RV32I-ILP32-NEXT: lh a2, 6(a0) +; RV32I-ILP32-NEXT: slli a0, a1, 16 +; RV32I-ILP32-NEXT: slli a1, a2, 16 +; RV32I-ILP32-NEXT: call __addsf3@plt +; RV32I-ILP32-NEXT: call __truncsfbf2@plt +; RV32I-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-ILP32-NEXT: addi sp, sp, 16 +; RV32I-ILP32-NEXT: ret +; +; RV64I-LP64-LABEL: bfloat_load: +; RV64I-LP64: # %bb.0: +; RV64I-LP64-NEXT: addi sp, sp, -16 +; RV64I-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-LP64-NEXT: lh a1, 0(a0) +; RV64I-LP64-NEXT: lh a2, 6(a0) +; RV64I-LP64-NEXT: slliw a0, a1, 16 +; RV64I-LP64-NEXT: slliw a1, a2, 16 +; RV64I-LP64-NEXT: call __addsf3@plt +; RV64I-LP64-NEXT: call __truncsfbf2@plt +; RV64I-LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-LP64-NEXT: addi sp, sp, 16 +; RV64I-LP64-NEXT: ret + %1 = load bfloat, ptr %a + %2 = getelementptr bfloat, ptr %a, i32 3 + %3 = load bfloat, ptr %2 + %4 = fadd bfloat %1, %3 + ret bfloat %4 +} + +define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind { +; RV32I-ILP32-LABEL: bfloat_store: +; RV32I-ILP32: # %bb.0: +; RV32I-ILP32-NEXT: addi sp, sp, -16 +; RV32I-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-ILP32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-ILP32-NEXT: mv s0, a0 +; 
RV32I-ILP32-NEXT: slli a0, a1, 16 +; RV32I-ILP32-NEXT: slli a1, a2, 16 +; RV32I-ILP32-NEXT: call __addsf3@plt +; RV32I-ILP32-NEXT: call __truncsfbf2@plt +; RV32I-ILP32-NEXT: sh a0, 0(s0) +; RV32I-ILP32-NEXT: sh a0, 16(s0) +; RV32I-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-ILP32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-ILP32-NEXT: addi sp, sp, 16 +; RV32I-ILP32-NEXT: ret +; +; RV64I-LP64-LABEL: bfloat_store: +; RV64I-LP64: # %bb.0: +; RV64I-LP64-NEXT: addi sp, sp, -16 +; RV64I-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-LP64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64I-LP64-NEXT: mv s0, a0 +; RV64I-LP64-NEXT: slliw a0, a1, 16 +; RV64I-LP64-NEXT: slliw a1, a2, 16 +; RV64I-LP64-NEXT: call __addsf3@plt +; RV64I-LP64-NEXT: call __truncsfbf2@plt +; RV64I-LP64-NEXT: sh a0, 0(s0) +; RV64I-LP64-NEXT: sh a0, 16(s0) +; RV64I-LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-LP64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-LP64-NEXT: addi sp, sp, 16 +; RV64I-LP64-NEXT: ret + %1 = fadd bfloat %b, %c + store bfloat %1, ptr %a + %2 = getelementptr bfloat, ptr %a, i32 8 + store bfloat %1, ptr %2 + ret void +} From 32fc78c26ff591f627d3a6d4f8e0ab7c2092f1e5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 14:17:01 -0700 Subject: [PATCH 033/704] [clangd] Remove unused declaration getCurrentPreamble The corresponding function definition was removed by: commit 2214b9076f1d3a4784820c4479e2417685e5c980 Author: Kadir Cetinkaya Date: Thu Apr 2 10:53:23 2020 +0200 --- clang-tools-extra/clangd/TUScheduler.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp index 126937ff49e90..5fc98eec3a2fd 100644 --- a/clang-tools-extra/clangd/TUScheduler.cpp +++ b/clang-tools-extra/clangd/TUScheduler.cpp @@ -635,10 +635,6 @@ class ASTWorker { std::shared_ptr Preamble, std::vector CIDiags, WantDiagnostics WantDiags); - /// Obtain a preamble reflecting all updates so far. Threadsafe. - /// It may be delivered immediately, or later on the worker thread. - void getCurrentPreamble( - llvm::unique_function)>); /// Returns compile command from the current file inputs.
tooling::CompileCommand getCurrentCompileCommand() const; From 5bbb66cae7d19836b0177f453fe6633a491f9590 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 May 2023 14:17:03 -0700 Subject: [PATCH 034/704] [clangd] Remove unused variable IsBroadcasting The variable was introduced without a use by: commit 2bebc3d0602b407b3f351e782940959da5808f97 Author: Sam McCall Date: Tue Nov 20 10:56:03 2018 +0000 --- clang-tools-extra/clangd/support/Function.h | 1 - 1 file changed, 1 deletion(-) diff --git a/clang-tools-extra/clangd/support/Function.h b/clang-tools-extra/clangd/support/Function.h index 5437729d91b62..a0bce839a77d8 100644 --- a/clang-tools-extra/clangd/support/Function.h +++ b/clang-tools-extra/clangd/support/Function.h @@ -97,7 +97,6 @@ template class Event { "use a plain type: event values are always passed by const&"); std::recursive_mutex ListenersMu; - bool IsBroadcasting = false; std::vector> Listeners; unsigned ListenerCount = 0; }; From 49614c1dc99df1684edecb622228bc5d37e293d7 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 29 May 2023 14:24:15 -0700 Subject: [PATCH 035/704] [libc++] Add missing _LIBCPP_HIDE_FROM_ABI macro on constructor --- libcxx/include/vector | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/include/vector b/libcxx/include/vector index 86390fb0576d0..82078fbb248ad 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -481,7 +481,7 @@ public: private: class __destroy_vector { public: - _LIBCPP_CONSTEXPR __destroy_vector(vector& __vec) : __vec_(__vec) {} + _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI __destroy_vector(vector& __vec) : __vec_(__vec) {} _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void operator()() { std::__debug_db_erase_c(std::addressof(__vec_)); From b936816fb305cc16d2d1cd6d424c08c39a681a32 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 29 May 2023 01:38:44 -0700 Subject: [PATCH 036/704] MLIR/Cuda: Add the appropriate "HINTS" on CMake find_library and mark these REQUIRED The cmake logic to find cuda paths exposes some paths to search for the cuda library, we need to propagate this through the call for find_library. This was already done for cuSparse but not for cuda. Differential Revision: https://reviews.llvm.org/D151645 --- mlir/lib/Dialect/GPU/CMakeLists.txt | 3 +-- mlir/lib/ExecutionEngine/CMakeLists.txt | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index 31790490828f1..ca163338f4237 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -120,8 +120,7 @@ if(MLIR_ENABLE_CUDA_RUNNER) ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ) - find_library(CUDA_DRIVER_LIBRARY cuda) - + find_library(CUDA_DRIVER_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) target_link_libraries(MLIRGPUTransforms PRIVATE MLIRNVVMToLLVMIRTranslation diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index 369fd1b8ca77f..e2f76fa51ba93 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -188,10 +188,10 @@ if(LLVM_ENABLE_PIC) endif() # We need the libcuda.so library. - find_library(CUDA_RUNTIME_LIBRARY cuda) + find_library(CUDA_RUNTIME_LIBRARY cuda HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) # We need the libcusparse.so library. 
- find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) + find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} REQUIRED) add_mlir_library(mlir_cuda_runtime SHARED From f55bac933f0b212ca630839c598510c9981ac2cb Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 29 May 2023 20:24:25 -0400 Subject: [PATCH 037/704] [gn] Port 9f6250f more Follow-up to da54bd230a. * Add dep to _cg and _sema targets only to CodeGen and Sema, like with the other Basic clang_tablegen()s * Make tablegen_headers depend on arm_sme_draft_spec_subject_to_change so that the header gets installed --- llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn | 2 -- llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn | 1 + llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 1 + llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn | 1 + 4 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index ab89fac29c4e6..a8e2eb5779617 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -27,8 +27,6 @@ static_library("Basic") { "//clang/include/clang/Basic:arm_cde_builtins", "//clang/include/clang/Basic:arm_mve_builtins", "//clang/include/clang/Basic:arm_sme_builtins", - "//clang/include/clang/Basic:arm_sme_builtin_cg", - "//clang/include/clang/Basic:arm_sme_sema_rangechecks", "//clang/include/clang/Basic:arm_sve_builtins", "//clang/include/clang/Basic:arm_sve_typeflags", "//clang/include/clang/Basic:diags_tablegen", diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn index cfb9c993d57c8..bdd742620e708 100644 --- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn @@ -4,6 +4,7 @@ static_library("CodeGen") { deps = [ "//clang/include/clang/Basic:arm_cde_builtin_cg", "//clang/include/clang/Basic:arm_mve_builtin_cg", + "//clang/include/clang/Basic:arm_sme_builtin_cg", "//clang/include/clang/Basic:arm_sve_builtin_cg", "//clang/include/clang/Basic:riscv_sifive_vector_builtin_cg", "//clang/include/clang/Basic:riscv_vector_builtin_cg", diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index a8a193876061f..bcfe472f4f351 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -65,6 +65,7 @@ copy("tablegen_headers") { ":arm_fp16", ":arm_mve", ":arm_neon", + ":arm_sme_draft_spec_subject_to_change", ":arm_sve", ":riscv_vector", ] diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn index 24e25cfd32fcc..65cf305ab1e8b 100644 --- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn @@ -13,6 +13,7 @@ static_library("Sema") { "//clang/include/clang/Basic:arm_cde_builtin_sema", "//clang/include/clang/Basic:arm_mve_builtin_aliases", "//clang/include/clang/Basic:arm_mve_builtin_sema", + "//clang/include/clang/Basic:arm_sme_sema_rangechecks", "//clang/include/clang/Basic:arm_sve_sema_rangechecks", "//clang/include/clang/Basic:riscv_sifive_vector_builtin_sema", "//clang/include/clang/Basic:riscv_vector_builtin_sema", From a1e78615fb331484e07c2201433ba1e683348c47 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Mon, 29 May 2023 13:17:28 -0700 Subject: [PATCH 038/704] 
[mlir][complex] Canonicalize re/im(neg(create)) We can just convert this to arith.negf. Reviewed By: kuhar Differential Revision: https://reviews.llvm.org/D151633 --- .../mlir/Dialect/Complex/IR/ComplexOps.td | 2 + mlir/lib/Dialect/Complex/IR/ComplexOps.cpp | 38 +++++++++++++++++++ mlir/test/Dialect/Complex/canonicalize.mlir | 22 +++++++++++ 3 files changed, 62 insertions(+) diff --git a/mlir/include/mlir/Dialect/Complex/IR/ComplexOps.td b/mlir/include/mlir/Dialect/Complex/IR/ComplexOps.td index 7116bed2763f6..dd7c1a8ca8866 100644 --- a/mlir/include/mlir/Dialect/Complex/IR/ComplexOps.td +++ b/mlir/include/mlir/Dialect/Complex/IR/ComplexOps.td @@ -290,6 +290,7 @@ def ImOp : ComplexUnaryOp<"im", let results = (outs AnyFloat:$imaginary); let hasFolder = 1; + let hasCanonicalizer = 1; } //===----------------------------------------------------------------------===// @@ -436,6 +437,7 @@ def ReOp : ComplexUnaryOp<"re", let results = (outs AnyFloat:$real); let hasFolder = 1; + let hasCanonicalizer = 1; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Complex/IR/ComplexOps.cpp b/mlir/lib/Dialect/Complex/IR/ComplexOps.cpp index f2d1a96fa4a28..f8c9b63f12aa2 100644 --- a/mlir/lib/Dialect/Complex/IR/ComplexOps.cpp +++ b/mlir/lib/Dialect/Complex/IR/ComplexOps.cpp @@ -6,9 +6,12 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/PatternMatch.h" using namespace mlir; using namespace mlir::complex; @@ -99,6 +102,36 @@ OpFoldResult ImOp::fold(FoldAdaptor adaptor) { return {}; } +namespace { +template +struct FoldComponentNeg final : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(OpKind op, + PatternRewriter &rewriter) const override { + auto negOp = op.getOperand().template getDefiningOp(); + if (!negOp) + return failure(); + + auto createOp = negOp.getComplex().template getDefiningOp(); + if (!createOp) + return failure(); + + Type elementType = createOp.getType().getElementType(); + assert(isa(elementType)); + + rewriter.replaceOpWithNewOp( + op, elementType, createOp.getOperand(ComponentIndex)); + return success(); + } +}; +} // namespace + +void ImOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results.add>(context); +} + //===----------------------------------------------------------------------===// // ReOp //===----------------------------------------------------------------------===// @@ -113,6 +146,11 @@ OpFoldResult ReOp::fold(FoldAdaptor adaptor) { return {}; } +void ReOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results.add>(context); +} + //===----------------------------------------------------------------------===// // AddOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Complex/canonicalize.mlir b/mlir/test/Dialect/Complex/canonicalize.mlir index f0d287fde18aa..2fd2002c5cedf 100644 --- a/mlir/test/Dialect/Complex/canonicalize.mlir +++ b/mlir/test/Dialect/Complex/canonicalize.mlir @@ -155,3 +155,25 @@ func.func @complex_sub_zero() -> complex { %sub = complex.sub %complex1, %complex2 : complex return %sub : complex } + +// CHECK-LABEL: func @re_neg +// CHECK-SAME: (%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32)
+func.func @re_neg(%arg0: f32, %arg1: f32) -> f32 { + %create = complex.create %arg0, %arg1: complex + // CHECK: %[[NEG:.*]] = arith.negf %[[ARG0]] + %neg = complex.neg %create : complex + %re = complex.re %neg : complex + // CHECK-NEXT: return %[[NEG]] + return %re : f32 +} + +// CHECK-LABEL: func @im_neg +// CHECK-SAME: (%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32) +func.func @im_neg(%arg0: f32, %arg1: f32) -> f32 { + %create = complex.create %arg0, %arg1: complex + // CHECK: %[[NEG:.*]] = arith.negf %[[ARG1]] + %neg = complex.neg %create : complex + %im = complex.im %neg : complex + // CHECK-NEXT: return %[[NEG]] + return %im : f32 +} From 52aaac635a0f084fbbe4a4209afba3447b2af068 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Mon, 29 May 2023 17:54:22 -0700 Subject: [PATCH 039/704] [mlir][spirv][complex] Support converting complex.constant Reviewed By: kuhar Differential Revision: https://reviews.llvm.org/D151622 --- .../ComplexToSPIRV/ComplexToSPIRV.cpp | 23 +++++++++++++++++-- .../ComplexToSPIRV/complex-to-spirv.mlir | 9 ++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Conversion/ComplexToSPIRV/ComplexToSPIRV.cpp b/mlir/lib/Conversion/ComplexToSPIRV/ComplexToSPIRV.cpp index cb1362dc7f120..d531659e0623a 100644 --- a/mlir/lib/Conversion/ComplexToSPIRV/ComplexToSPIRV.cpp +++ b/mlir/lib/Conversion/ComplexToSPIRV/ComplexToSPIRV.cpp @@ -28,6 +28,25 @@ using namespace mlir; namespace { +struct ConstantOpPattern final : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(complex::ConstantOp constOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto spirvType = + getTypeConverter()->convertType(constOp.getType()); + if (!spirvType) + return rewriter.notifyMatchFailure(constOp, + "unable to convert result type"); + + rewriter.replaceOpWithNewOp( + constOp, spirvType, + DenseElementsAttr::get(spirvType, constOp.getValue().getValue())); + return success(); + } +}; + struct CreateOpPattern final : OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -87,6 +106,6 @@ void mlir::populateComplexToSPIRVPatterns(SPIRVTypeConverter &typeConverter, RewritePatternSet &patterns) { MLIRContext *context = patterns.getContext(); - patterns.add(typeConverter, - context); + patterns.add( + typeConverter, context); } diff --git a/mlir/test/Conversion/ComplexToSPIRV/complex-to-spirv.mlir b/mlir/test/Conversion/ComplexToSPIRV/complex-to-spirv.mlir index 6b56a4488b882..45f38d435c50b 100644 --- a/mlir/test/Conversion/ComplexToSPIRV/complex-to-spirv.mlir +++ b/mlir/test/Conversion/ComplexToSPIRV/complex-to-spirv.mlir @@ -38,3 +38,12 @@ func.func @imaginary_number(%arg: complex) -> f32 { // CHECK: %[[IM:.+]] = spirv.CompositeExtract %[[CAST]][1 : i32] : vector<2xf32> // CHECK: return %[[IM]] : f32 +// ----- + +func.func @complex_const() -> complex { + %cst = complex.constant [0x7FC00000 : f32, 0.000000e+00 : f32] : complex + return %cst : complex +} + +// CHECK-LABEL: func.func @complex_const() +// CHECK: spirv.Constant dense<[0x7FC00000, 0.000000e+00]> : vector<2xf32> From 071e9d7bac7a5c879b1c67e1c4e847814f6d8254 Mon Sep 17 00:00:00 2001 From: Jianjian GUAN Date: Mon, 29 May 2023 17:27:36 +0800 Subject: [PATCH 040/704] [RISCV] Fix unmasked vp_abs select. Make unmasked vp_abs select to unmasked instructions.
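For reference, "unmasked" here means the VP mask operand is an all-true splat, as in the updated tests. A minimal IR sketch (illustration only, not part of the diff; the vp.abs call signature is reconstructed, so see the test file's declares for the exact form):

  %head = insertelement <2 x i1> poison, i1 true, i32 0
  %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
  ; %m is known all-ones, so this vp_abs is effectively unmasked
  %r = call <2 x i8> @llvm.vp.abs.v2i8(<2 x i8> %x, i1 false, <2 x i1> %m, i32 %evl)

With this change such calls select the plain vrsub.vi/vmax.vv sequence instead of vmset.m followed by masked (v0.t) instructions, as the updated CHECK lines below show.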
Reviewed By: fakepaper56 Differential Revision: https://reviews.llvm.org/D151646 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 + .../CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll | 112 ++++++------------ 2 files changed, 41 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7bfac465413f4..e853251c254e7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -8048,6 +8048,9 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const { SDValue Mask, VL; if (Op->getOpcode() == ISD::VP_ABS) { Mask = Op->getOperand(1); + if (VT.isFixedLengthVector()) + Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG, + Subtarget); VL = Op->getOperand(2); } else std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll index 0b58eb6ad726e..3940c672ab823 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-abs-vp.ll @@ -20,11 +20,9 @@ define <2 x i8> @vp_abs_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i8> @vp_abs_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v2i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer @@ -48,11 +46,9 @@ define <4 x i8> @vp_abs_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i8> @vp_abs_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v4i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -76,11 +72,9 @@ define <8 x i8> @vp_abs_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i8> @vp_abs_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v8i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -104,11 +98,9 @@ define <16 x i8> @vp_abs_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { define <16 x i8> @vp_abs_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v16i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; 
CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -132,11 +124,9 @@ define <2 x i16> @vp_abs_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i16> @vp_abs_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v2i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer @@ -160,11 +150,9 @@ define <4 x i16> @vp_abs_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i16> @vp_abs_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v4i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -188,11 +176,9 @@ define <8 x i16> @vp_abs_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i16> @vp_abs_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v8i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -216,11 +202,9 @@ define <16 x i16> @vp_abs_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) define <16 x i16> @vp_abs_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v16i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v10, v0.t +; CHECK-NEXT: vrsub.vi v10, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v10 ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -244,11 +228,9 @@ define <2 x i32> @vp_abs_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i32> @vp_abs_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v2i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer @@ -272,11 +254,9 @@ define <4 x i32> @vp_abs_v4i32(<4 x 
i32> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i32> @vp_abs_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v4i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -300,11 +280,9 @@ define <8 x i32> @vp_abs_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i32> @vp_abs_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v8i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v10, v0.t +; CHECK-NEXT: vrsub.vi v10, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v10 ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -328,11 +306,9 @@ define <16 x i32> @vp_abs_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) define <16 x i32> @vp_abs_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v16i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vrsub.vi v12, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v12, v0.t +; CHECK-NEXT: vrsub.vi v12, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v12 ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -356,11 +332,9 @@ define <2 x i64> @vp_abs_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i64> @vp_abs_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v2i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vrsub.vi v9, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v9, v0.t +; CHECK-NEXT: vrsub.vi v9, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v9 ; CHECK-NEXT: ret %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer @@ -384,11 +358,9 @@ define <4 x i64> @vp_abs_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i64> @vp_abs_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v4i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vrsub.vi v10, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v10, v0.t +; CHECK-NEXT: vrsub.vi v10, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v10 ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -412,11 +384,9 @@ define <8 x i64> @vp_abs_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i64> @vp_abs_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v8i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli 
zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vrsub.vi v12, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v12, v0.t +; CHECK-NEXT: vrsub.vi v12, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v12 ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -440,11 +410,9 @@ define <15 x i64> @vp_abs_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) define <15 x i64> @vp_abs_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v15i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vrsub.vi v16, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v16, v0.t +; CHECK-NEXT: vrsub.vi v16, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v16 ; CHECK-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -468,11 +436,9 @@ define <16 x i64> @vp_abs_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) define <16 x i64> @vp_abs_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v16i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vrsub.vi v16, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v16, v0.t +; CHECK-NEXT: vrsub.vi v16, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v16 ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -512,24 +478,22 @@ define <32 x i64> @vp_abs_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) define <32 x i64> @vp_abs_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_abs_v32i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vrsub.vi v24, v8, 0, v0.t -; CHECK-NEXT: vmax.vv v8, v8, v24, v0.t +; CHECK-NEXT: vrsub.vi v24, v8, 0 +; CHECK-NEXT: vmax.vv v8, v8, v24 ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vrsub.vi v24, v16, 0, v0.t -; CHECK-NEXT: vmax.vv v16, v16, v24, v0.t +; CHECK-NEXT: vrsub.vi v24, v16, 0 +; CHECK-NEXT: vmax.vv v16, v16, v24 ; CHECK-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer From 9239d3a3eaf278ecf36376760b21e49512de6ac6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 29 May 2023 19:44:43 -0700 Subject: [PATCH 041/704] [RISCV] Teach performCombineVMergeAndVOps to handle FMA instructions. Previously we only handled instructions with merge ops that were also masked. This patch supports instructions with merge ops that aren't masked, like FMA. I'm only folding into a TU vmerge for now. Supporting TA vmerge shouldn't be much more work, but we need to make sure we get the policy operand for the result correct. And of course we need more tests.
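As an illustration, this is the shape of IR that now folds (a condensed sketch of the added vfwmacc_vv_nxv1f32_tu test, not part of the diff; %ones is an all-true splat and %m is the merge's real mask):

  %ae = call <vscale x 1 x float> @llvm.vp.fpext.nxv1f32.nxv1f16(<vscale x 1 x half> %a, <vscale x 1 x i1> %ones, i32 %evl)
  %be = call <vscale x 1 x float> @llvm.vp.fpext.nxv1f32.nxv1f16(<vscale x 1 x half> %b, <vscale x 1 x i1> %ones, i32 %evl)
  ; the FMA itself is unmasked: its mask operand is the all-true %ones
  %fma = call <vscale x 1 x float> @llvm.vp.fma.nxv1f32(<vscale x 1 x float> %ae, <vscale x 1 x float> %be, <vscale x 1 x float> %c, <vscale x 1 x i1> %ones, i32 %evl)
  ; %c is both the FMA addend and the merge's false operand
  %res = call <vscale x 1 x float> @llvm.vp.merge.nxv1f32(<vscale x 1 x i1> %m, <vscale x 1 x float> %fma, <vscale x 1 x float> %c, i32 %evl)

Patterns like this can now be selected as a single tail-undisturbed masked instruction (vfwmacc.vv ..., v0.t under a tu policy) instead of an FMA followed by a separate vmerge, as the added CHECK lines show.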
Reviewed By: fakepaper56, frasercrmck Differential Revision: https://reviews.llvm.org/D151596 --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 42 +++++--- llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll | 105 ++++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll | 35 +++++++ 3 files changed, 169 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index e4dd7ec9592ba..8981e4eba49ee 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3249,18 +3249,40 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N, bool IsTA) { uint64_t TrueTSFlags = TII->get(TrueOpc).TSFlags; bool HasMergeOp = RISCVII::hasMergeOp(TrueTSFlags); + bool IsMasked = false; + const RISCV::RISCVMaskedPseudoInfo *Info = + RISCV::lookupMaskedIntrinsicByUnmaskedTA(TrueOpc); + if (!Info && HasMergeOp) { + Info = RISCV::getMaskedPseudoInfo(TrueOpc); + IsMasked = true; + } + + if (!Info) + return false; + if (HasMergeOp) { // The vmerge instruction must be TU. + // FIXME: This could be relaxed, but we need to handle the policy for the + // resulting op correctly. if (IsTA) return false; - SDValue MergeOpN = N->getOperand(0); SDValue MergeOpTrue = True->getOperand(0); // Both the vmerge instruction and the True instruction must have the same - // merge operand. The vmerge instruction must have an all 1s mask since - // we're going to keep the mask from the True instruction. + // merge operand. + if (False != MergeOpTrue) + return false; + } + + if (IsMasked) { + assert(HasMergeOp && "Expected merge op"); + // The vmerge instruction must be TU. + if (IsTA) + return false; + // The vmerge instruction must have an all 1s mask since we're going to keep + // the mask from the True instruction. // FIXME: Support mask agnostic True instruction which would have an // undef merge operand. - if (MergeOpN != MergeOpTrue || !usesAllOnesMask(N, /* MaskOpIdx */ 3)) + if (!usesAllOnesMask(N, /* MaskOpIdx */ 3)) return false; } @@ -3269,13 +3291,6 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N, bool IsTA) { if (TII->get(TrueOpc).hasUnmodeledSideEffects()) return false; - const RISCV::RISCVMaskedPseudoInfo *Info = - HasMergeOp ? RISCV::getMaskedPseudoInfo(TrueOpc) - : RISCV::lookupMaskedIntrinsicByUnmaskedTA(TrueOpc); - - if (!Info) - return false; - // The last operand of a masked instruction may be glued. 
bool HasGlueOp = True->getGluedNode() != nullptr; @@ -3324,14 +3339,15 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N, bool IsTA) { "Expected instructions with mask have merge operand."); SmallVector Ops; - if (HasMergeOp) { + if (IsMasked) { Ops.append(True->op_begin(), True->op_begin() + TrueVLIndex); Ops.append({VL, /* SEW */ True.getOperand(TrueVLIndex + 1)}); Ops.push_back( CurDAG->getTargetConstant(Policy, DL, Subtarget->getXLenVT())); Ops.append(True->op_begin() + TrueVLIndex + 3, True->op_end()); } else { - Ops.push_back(False); + if (!HasMergeOp) + Ops.push_back(False); Ops.append(True->op_begin(), True->op_begin() + TrueVLIndex); Ops.append({Mask, VL, /* SEW */ True.getOperand(TrueVLIndex + 1)}); Ops.push_back( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll index 9586f62b4d352..330eb82e13f3b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll @@ -7,6 +7,7 @@ declare @llvm.vp.fma.nxv1f32(, , , , i32) declare @llvm.vp.fneg.nxv1f32(, , i32) declare @llvm.vp.fpext.nxv1f32.nxv1f16(, , i32) +declare @llvm.vp.merge.nxv1f32(, , , i32) define @vfmacc_vv_nxv1f32( %a, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_nxv1f32: @@ -36,6 +37,56 @@ define @vfmacc_vv_nxv1f32_unmasked( %a, ret %v } +define @vfmacc_vv_nxv1f32_tu( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmacc_vv_nxv1f32_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfwmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %aext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %a, %allones, i32 %evl) + %bext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %b, %allones, i32 %evl) + %v = call @llvm.vp.fma.nxv1f32( %aext, %bext, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1f32( %m, %v, %c, i32 %evl) + ret %u +} + +; FIXME: Support this case? 
+define @vfmacc_vv_nxv1f32_masked__tu( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmacc_vv_nxv1f32_masked__tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v11, v10 +; CHECK-NEXT: vfwmacc.vv v11, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %aext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %a, %m, i32 %evl) + %bext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %b, %m, i32 %evl) + %v = call @llvm.vp.fma.nxv1f32( %aext, %bext, %c, %m, i32 %evl) + %u = call @llvm.vp.merge.nxv1f32( %m, %v, %c, i32 %evl) + ret %u +} + +define @vfmacc_vv_nxv1f32_unmasked_tu( %a, %b, %c, i32 zeroext %evl) { +; CHECK-LABEL: vfmacc_vv_nxv1f32_unmasked_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vfwmacc.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %aext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %a, %allones, i32 %evl) + %bext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %b, %allones, i32 %evl) + %v = call @llvm.vp.fma.nxv1f32( %aext, %bext, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1f32( %allones, %v, %c, i32 %evl) + ret %u +} + define @vfmacc_vf_nxv1f32( %va, half %b, %vc, %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_nxv1f32: ; CHECK: # %bb.0: @@ -83,6 +134,60 @@ define @vfmacc_vf_nxv1f32_unmasked( %va, ret %v } +define @vfmacc_vf_nxv1f32_tu( %va, half %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmacc_vf_nxv1f32_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfwmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, half %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %vaext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %va, %allones, i32 %evl) + %vbext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %vb, %allones, i32 %evl) + %v = call @llvm.vp.fma.nxv1f32( %vaext, %vbext, %vc, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1f32( %m, %v, %vc, i32 %evl) + ret %u +} + +define @vfmacc_vf_nxv1f32_commute_tu( %va, half %b, %vc, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmacc_vf_nxv1f32_commute_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfwmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, half %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %vaext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %va, %allones, i32 %evl) + %vbext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %vb, %allones, i32 %evl) + %v = call @llvm.vp.fma.nxv1f32( %vbext, %vaext, %vc, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1f32( %m, %v, %vc, i32 %evl) + ret %u +} + +define @vfmacc_vf_nxv1f32_unmasked_tu( %va, half %b, %vc, i32 zeroext %evl) { +; CHECK-LABEL: vfmacc_vf_nxv1f32_unmasked_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vfwmacc.vf v9, fa0, v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, half %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, 
i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %vaext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %va, %allones, i32 %evl) + %vbext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %vb, %allones, i32 %evl) + %v = call @llvm.vp.fma.nxv1f32( %vaext, %vbext, %vc, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1f32( %allones, %v, %vc, i32 %evl) + ret %u +} + declare @llvm.vp.fma.nxv2f32(, , , , i32) declare @llvm.vp.fneg.nxv2f32(, , i32) declare @llvm.vp.merge.nxv2f32(, , , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll index 578caa36855da..b27a1e0f3c50f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwmsac-vp.ll @@ -7,6 +7,7 @@ declare @llvm.vp.fma.nxv1f32(, , , , i32) declare @llvm.vp.fneg.nxv1f32(, , i32) declare @llvm.vp.fpext.nxv1f32.nxv1f16(, , i32) +declare @llvm.vp.merge.nxv1f32(, , , i32) define @vmfsac_vv_nxv1f32( %a, %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmfsac_vv_nxv1f32: @@ -38,6 +39,40 @@ define @vmfsac_vv_nxv1f32_unmasked( %a, ret %v } +define @vmfsac_vv_nxv1f32_tu( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmfsac_vv_nxv1f32_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfwmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %aext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %a, %allones, i32 %evl) + %bext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %b, %allones, i32 %evl) + %negc = call @llvm.vp.fneg.nxv1f32( %c, %allones, i32 %evl) + %v = call @llvm.vp.fma.nxv1f32( %aext, %bext, %negc, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1f32( %m, %v, %c, i32 %evl) + ret %u +} + +define @vmfsac_vv_nxv1f32_unmasked_tu( %a, %b, %c, i32 zeroext %evl) { +; CHECK-LABEL: vmfsac_vv_nxv1f32_unmasked_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vfwmsac.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %aext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %a, %allones, i32 %evl) + %bext = call @llvm.vp.fpext.nxv1f32.nxv1f16( %b, %allones, i32 %evl) + %negc = call @llvm.vp.fneg.nxv1f32( %c, %allones, i32 %evl) + %v = call @llvm.vp.fma.nxv1f32( %aext, %bext, %negc, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1f32( %allones, %v, %c, i32 %evl) + ret %u +} + define @vmfsac_vf_nxv1f32( %a, half %b, %c, %m, i32 zeroext %evl) { ; CHECK-LABEL: vmfsac_vf_nxv1f32: ; CHECK: # %bb.0: From 8f386ff69ab8e012c1716ae05e70fd5288435835 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Tue, 30 May 2023 11:20:41 +0800 Subject: [PATCH 042/704] [ms-inline asm] Add error check for `getAsInteger` .Imm can get lexed as a real, but a real doesn't equal to .Imm, e.g., 2.5 or .123e+8. We should report error for it rather than silently ignore. 
Reviewed By: skan Differential Revision: https://reviews.llvm.org/D151652 --- llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 54d297bd58720..8c6ae1d1611aa 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2315,7 +2315,8 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, // .Imm gets lexed as a real. if (Tok.is(AsmToken::Real)) { APInt DotDisp; - DotDispStr.getAsInteger(10, DotDisp); + if (DotDispStr.getAsInteger(10, DotDisp)) + return Error(Tok.getLoc(), "Unexpected offset"); Info.Offset = DotDisp.getZExtValue(); } else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) && Tok.is(AsmToken::Identifier)) { From 944773436ab1e89d624b3207cf67ea1971b5d17e Mon Sep 17 00:00:00 2001 From: Jianjian GUAN Date: Tue, 30 May 2023 11:03:26 +0800 Subject: [PATCH 043/704] [RISCV][NFC] Fix unmasked test for vp_cttz and vp_ctlz. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D151673 --- llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll | 2109 ++++++++--------- llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll | 1473 ++++++------ .../RISCV/rvv/fixed-vectors-ctlz-vp.ll | 1731 +++++++------- .../RISCV/rvv/fixed-vectors-cttz-vp.ll | 1191 +++++----- 4 files changed, 3026 insertions(+), 3478 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll index ffcd512e70449..eb914e03b54cb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll @@ -37,30 +37,28 @@ define @vp_ctlz_nxv1i8( %va, @vp_ctlz_nxv1i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_nxv1i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv1i8( %va, i1 false, %m, i32 %evl) ret %v @@ -99,30 +97,28 @@ define @vp_ctlz_nxv2i8( %va, 
@vp_ctlz_nxv2i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_nxv2i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv2i8( %va, i1 false, %m, i32 %evl) ret %v @@ -161,30 +157,28 @@ define @vp_ctlz_nxv4i8( %va, @vp_ctlz_nxv4i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_nxv4i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv4i8( %va, i1 false, %m, i32 %evl) ret %v @@ -223,30 +217,28 @@ 
define @vp_ctlz_nxv8i8( %va, @vp_ctlz_nxv8i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_nxv8i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv8i8( %va, i1 false, %m, i32 %evl) ret %v @@ -285,30 +277,28 @@ define @vp_ctlz_nxv16i8( %va, @vp_ctlz_nxv16i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_nxv16i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v10, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v10, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v10, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vsrl.vi v10, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v10, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v10, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv16i8( 
%va, i1 false, %m, i32 %evl) ret %v @@ -347,30 +337,28 @@ define @vp_ctlz_nxv32i8( %va, @vp_ctlz_nxv32i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_nxv32i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v12, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v12, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v12, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vsrl.vi v12, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vsrl.vi v12, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v12, v12, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v12, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v12, v8, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v12, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v12, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv32i8( %va, i1 false, %m, i32 %evl) ret %v @@ -409,30 +397,28 @@ define @vp_ctlz_nxv64i8( %va, @vp_ctlz_nxv64i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_nxv64i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v16, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v16, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v16, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: vsrl.vi v16, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: vsrl.vi v16, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v16, v16, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v16, v0.t +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v16, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v16, v8, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v16, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, 
i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv64i8( %va, i1 false, %m, i32 %evl) ret %v @@ -513,74 +499,70 @@ define @vp_ctlz_nxv1i16( %va, @vp_ctlz_nxv1i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv1i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a0, 5 +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv1i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: 
vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv1i16( %va, i1 false, %m, i32 %evl) ret %v @@ -661,74 +643,70 @@ define @vp_ctlz_nxv2i16( %va, @vp_ctlz_nxv2i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv2i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv2i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx 
v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv2i16( %va, i1 false, %m, i32 %evl) ret %v @@ -809,74 +787,70 @@ define @vp_ctlz_nxv4i16( %va, @vp_ctlz_nxv4i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv4i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv4i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv 
v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv4i16( %va, i1 false, %m, i32 %evl) ret %v @@ -957,74 +931,70 @@ define @vp_ctlz_nxv8i16( %va, @vp_ctlz_nxv8i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv8i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 
+; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv8i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v10, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv8i16( %va, i1 false, %m, i32 %evl) ret %v @@ -1105,74 +1075,70 @@ define @vp_ctlz_nxv16i16( %va, @vp_ctlz_nxv16i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv16i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, 
v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv16i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v12, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v12, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv16i16( %va, i1 false, %m, i32 %evl) ret %v @@ -1253,74 +1219,70 @@ define @vp_ctlz_nxv32i16( %va, @vp_ctlz_nxv32i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv32i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; 
RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv32i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv32i16( %va, i1 false, %m, i32 %evl) ret %v @@ -1407,80 +1369,76 @@ define @vp_ctlz_nxv1i32( %va, @vp_ctlz_nxv1i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv1i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; 
RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv1i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: 
vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv1i32( %va, i1 false, %m, i32 %evl) ret %v @@ -1567,80 +1525,76 @@ define @vp_ctlz_nxv2i32( %va, @vp_ctlz_nxv2i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv2i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv2i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, 
v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv2i32( %va, i1 false, %m, i32 %evl) ret %v @@ -1727,80 +1681,76 @@ define @vp_ctlz_nxv4i32( %va, @vp_ctlz_nxv4i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv4i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, 
v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv4i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v10, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv4i32( %va, i1 false, %m, i32 %evl) ret %v @@ -1887,80 +1837,76 @@ define @vp_ctlz_nxv8i32( %va, @vp_ctlz_nxv8i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv8i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: lui a0, 
349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv8i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v12, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v12, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv8i32( %va, i1 false, %m, i32 %evl) ret %v @@ -2047,80 +1993,76 @@ define @vp_ctlz_nxv16i32( %va, @vp_ctlz_nxv16i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_nxv16i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; 
RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv16i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a0 +; 
RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv16i32( %va, i1 false, %m, i32 %evl) ret %v @@ -2241,8 +2183,6 @@ define @vp_ctlz_nxv1i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -2260,91 +2200,89 @@ define @vp_ctlz_nxv1i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v9, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vx v9, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v10, v8, v9 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v8, v8, v9 ; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v9, v0.t +; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: 
vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv1i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI37_0) ; RV64-NEXT: ld a0, %lo(.LCPI37_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI37_1) ; RV64-NEXT: ld a1, %lo(.LCPI37_1)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: vand.vx v9, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 +; RV64-NEXT: vand.vx v9, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: lui a0, %hi(.LCPI37_2) ; RV64-NEXT: ld a0, %lo(.LCPI37_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI37_3) ; RV64-NEXT: ld a1, %lo(.LCPI37_3)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv1i64( %va, i1 false, %m, i32 %evl) ret %v @@ -2465,8 +2403,6 @@ define @vp_ctlz_nxv2i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -2484,91 +2420,89 @@ define @vp_ctlz_nxv2i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 2 +; 
RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsrl.vx v10, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v10, v0.t +; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv2i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI39_0) ; RV64-NEXT: ld a0, %lo(.LCPI39_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI39_1) ; RV64-NEXT: ld a1, %lo(.LCPI39_1)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: vand.vx v10, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t +; RV64-NEXT: vsrl.vi v10, 
v8, 1 +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: vand.vx v10, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: lui a0, %hi(.LCPI39_2) ; RV64-NEXT: ld a0, %lo(.LCPI39_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI39_3) ; RV64-NEXT: ld a1, %lo(.LCPI39_3)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv2i64( %va, i1 false, %m, i32 %evl) ret %v @@ -2689,8 +2623,6 @@ define @vp_ctlz_nxv4i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -2708,91 +2640,89 @@ define @vp_ctlz_nxv4i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v16, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v8, 
v8, v12 ; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t +; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv4i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v12, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI41_0) ; RV64-NEXT: ld a0, %lo(.LCPI41_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI41_1) ; RV64-NEXT: ld a1, %lo(.LCPI41_1)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: vand.vx v12, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: lui a0, %hi(.LCPI41_2) ; RV64-NEXT: ld a0, %lo(.LCPI41_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI41_3) ; RV64-NEXT: ld a1, %lo(.LCPI41_3)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv4i64( %va, i1 false, %m, i32 %evl) ret %v @@ -2913,8 +2843,6 @@ define @vp_ctlz_nxv7i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -2932,91 +2860,89 @@ define @vp_ctlz_nxv7i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; 
RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv7i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI43_0) ; RV64-NEXT: ld a0, %lo(.LCPI43_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI43_1) ; 
RV64-NEXT: ld a1, %lo(.LCPI43_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: lui a0, %hi(.LCPI43_2) ; RV64-NEXT: ld a0, %lo(.LCPI43_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI43_3) ; RV64-NEXT: ld a1, %lo(.LCPI43_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv7i64( %va, i1 false, %m, i32 %evl) ret %v @@ -3137,8 +3063,6 @@ define @vp_ctlz_nxv8i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -3156,91 +3080,89 @@ define @vp_ctlz_nxv8i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi 
v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv8i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI45_0) ; RV64-NEXT: ld a0, %lo(.LCPI45_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI45_1) ; RV64-NEXT: ld a1, %lo(.LCPI45_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: lui a0, %hi(.LCPI45_2) ; RV64-NEXT: ld a0, %lo(.LCPI45_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI45_3) ; RV64-NEXT: ld a1, %lo(.LCPI45_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv8i64( %va, i1 false, %m, i32 %evl) ret %v @@ -3581,12 +3503,9 @@ define @vp_ctlz_nxv16i64_unmasked( %va, i ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 
0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -3609,92 +3528,42 @@ define @vp_ctlz_nxv16i64_unmasked( %va, i ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a2 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v16, 2, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v16, 8, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v16, 16, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 1 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vsrl.vi v24, v16, 2 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vsrl.vi v24, v16, 8 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vsrl.vi v24, v16, 16 +; RV32-NEXT: vor.vv v16, v16, v24 ; RV32-NEXT: li a2, 32 -; RV32-NEXT: vsrl.vx v24, v16, a2, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v24, v16, a2 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vnot.v v16, v16 +; RV32-NEXT: vsrl.vi v24, v16, 1 ; RV32-NEXT: addi a4, sp, 8 ; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a4), zero +; RV32-NEXT: vlse64.v v0, (a4), zero ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV32-NEXT: csrr a5, vlenb ; RV32-NEXT: li a6, 24 ; RV32-NEXT: mul a5, a5, a6 ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v16, v16, v24, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v0, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vsub.vv v16, v16, v24 ; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a4), zero +; RV32-NEXT: vlse64.v v0, (a4), zero ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: csrr a5, vlenb 
-; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v24, v16, v0.t -; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv v24, v16, v0 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a4), zero ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma @@ -3703,7 +3572,7 @@ define @vp_ctlz_nxv16i64_unmasked( %va, i ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 ; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v24, v16, v24 ; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a4), zero ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma @@ -3712,9 +3581,9 @@ define @vp_ctlz_nxv16i64_unmasked( %va, i ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vmul.vv v24, v24, v16, v0.t +; RV32-NEXT: vmul.vv v24, v24, v16 ; RV32-NEXT: li a3, 56 -; RV32-NEXT: vsrl.vx v16, v24, a3, v0.t +; RV32-NEXT: vsrl.vx v16, v24, a3 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: bltu a0, a1, .LBB47_2 @@ -3722,138 +3591,130 @@ define @vp_ctlz_nxv16i64_unmasked( %va, i ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: .LBB47_2: ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v24, v0.t -; RV32-NEXT: csrr a0, 
vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a3, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v8, v8, a3 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_nxv16i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: sub a2, a0, a1 ; RV64-NEXT: sltu a3, a0, a2 ; RV64-NEXT: addi a3, a3, -1 ; RV64-NEXT: and a2, a3, a2 ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 2, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 8, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 16, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t +; RV64-NEXT: vsrl.vi v24, v16, 1 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsrl.vi v24, v16, 2 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsrl.vi v24, v16, 8 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsrl.vi v24, v16, 16 +; RV64-NEXT: vor.vv v16, v16, v24 ; RV64-NEXT: li a2, 32 -; RV64-NEXT: vsrl.vx v24, v16, a2, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vnot.v v16, v16, v0.t +; RV64-NEXT: vsrl.vx v24, v16, a2 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vnot.v v16, v16 ; RV64-NEXT: lui a3, %hi(.LCPI47_0) -; RV64-NEXT: ld a4, %lo(.LCPI47_0)(a3) -; RV64-NEXT: lui a3, %hi(.LCPI47_1) -; RV64-NEXT: ld a3, %lo(.LCPI47_1)(a3) -; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV64-NEXT: vand.vx v24, v24, a4, v0.t -; RV64-NEXT: vsub.vv v16, v16, v24, v0.t -; RV64-NEXT: vand.vx v24, v16, a3, v0.t -; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV64-NEXT: vand.vx v16, v16, a3, v0.t -; RV64-NEXT: vadd.vv v16, v24, v16, v0.t +; RV64-NEXT: ld a3, %lo(.LCPI47_0)(a3) +; RV64-NEXT: lui a4, %hi(.LCPI47_1) +; RV64-NEXT: ld a4, %lo(.LCPI47_1)(a4) +; RV64-NEXT: vsrl.vi v24, v16, 1 +; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vsub.vv v16, v16, v24 +; RV64-NEXT: vand.vx v24, v16, a4 +; RV64-NEXT: vsrl.vi v16, v16, 2 +; 
RV64-NEXT: vand.vx v16, v16, a4 +; RV64-NEXT: vadd.vv v16, v24, v16 ; RV64-NEXT: lui a5, %hi(.LCPI47_2) ; RV64-NEXT: ld a5, %lo(.LCPI47_2)(a5) ; RV64-NEXT: lui a6, %hi(.LCPI47_3) ; RV64-NEXT: ld a6, %lo(.LCPI47_3)(a6) -; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV64-NEXT: vadd.vv v16, v16, v24, v0.t -; RV64-NEXT: vand.vx v16, v16, a5, v0.t -; RV64-NEXT: vmul.vx v16, v16, a6, v0.t +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vadd.vv v16, v16, v24 +; RV64-NEXT: vand.vx v16, v16, a5 +; RV64-NEXT: vmul.vx v16, v16, a6 ; RV64-NEXT: li a7, 56 -; RV64-NEXT: vsrl.vx v16, v16, a7, v0.t +; RV64-NEXT: vsrl.vx v16, v16, a7 ; RV64-NEXT: bltu a0, a1, .LBB47_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB47_2: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vx v24, v8, a2, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV64-NEXT: vand.vx v24, v24, a4, v0.t -; RV64-NEXT: vsub.vv v8, v8, v24, v0.t -; RV64-NEXT: vand.vx v24, v8, a3, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t -; RV64-NEXT: vadd.vv v8, v24, v8, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v24, v0.t -; RV64-NEXT: vand.vx v8, v8, a5, v0.t -; RV64-NEXT: vmul.vx v8, v8, a6, v0.t -; RV64-NEXT: vsrl.vx v8, v8, a7, v0.t +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsrl.vx v24, v8, a2 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vsub.vv v8, v8, v24 +; RV64-NEXT: vand.vx v24, v8, a4 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a4 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vand.vx v8, v8, a5 +; RV64-NEXT: vmul.vx v8, v8, a6 +; RV64-NEXT: vsrl.vx v8, v8, a7 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.ctlz.nxv16i64( %va, i1 false, %m, i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index e76a4d2a857b0..22e534c34283c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -34,27 +34,25 @@ define @vp_cttz_nxv1i8( %va, @vp_cttz_nxv1i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_nxv1i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v9, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 
+; CHECK-NEXT: vand.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv1i8( %va, i1 false, %m, i32 %evl) ret %v @@ -90,27 +88,25 @@ define @vp_cttz_nxv2i8( %va, @vp_cttz_nxv2i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_nxv2i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v9, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv2i8( %va, i1 false, %m, i32 %evl) ret %v @@ -146,27 +142,25 @@ define @vp_cttz_nxv4i8( %va, @vp_cttz_nxv4i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_nxv4i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v9, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, 
v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv4i8( %va, i1 false, %m, i32 %evl) ret %v @@ -202,27 +196,25 @@ define @vp_cttz_nxv8i8( %va, @vp_cttz_nxv8i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_nxv8i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v9, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv8i8( %va, i1 false, %m, i32 %evl) ret %v @@ -258,27 +250,25 @@ define @vp_cttz_nxv16i8( %va, @vp_cttz_nxv16i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_nxv16i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vsub.vx v10, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v10, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v10, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v10 +; CHECK-NEXT: vsrl.vi v10, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v10, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v10, v8, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v10, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v10, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call 
@llvm.vp.cttz.nxv16i8( %va, i1 false, %m, i32 %evl) ret %v @@ -314,27 +304,25 @@ define @vp_cttz_nxv32i8( %va, @vp_cttz_nxv32i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_nxv32i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vsub.vx v12, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v12, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v12, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v12 +; CHECK-NEXT: vsrl.vi v12, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v12, v12, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v12, v0.t +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v12, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v12, v8, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v12, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v12, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv32i8( %va, i1 false, %m, i32 %evl) ret %v @@ -370,27 +358,25 @@ define @vp_cttz_nxv64i8( %va, @vp_cttz_nxv64i8_unmasked( %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_nxv64i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vsub.vx v16, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v16, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v16, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v16 +; CHECK-NEXT: vsrl.vi v16, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v16, v16, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v16, v0.t +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v16, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v16, v8, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v16, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v16, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv64i8( %va, i1 false, %m, i32 %evl) ret %v @@ -461,64 +447,60 @@ define @vp_cttz_nxv1i16( %va, @vp_cttz_nxv1i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv1i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; 
RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv1i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv1i16( %va, i1 false, %m, i32 %evl) ret %v @@ -589,64 +571,60 @@ define @vp_cttz_nxv2i16( %va, @vp_cttz_nxv2i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv2i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 
+; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: lui a0, 5 +; RV32-NEXT: addi a0, a0, 1365 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv2i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv2i16( %va, i1 false, %m, i32 %evl) ret %v @@ -717,64 +695,60 @@ define @vp_cttz_nxv4i16( %va, @vp_cttz_nxv4i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv4i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: 
vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv4i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv4i16( %va, i1 false, %m, i32 %evl) ret %v @@ -845,64 +819,60 @@ define @vp_cttz_nxv8i16( %va, @vp_cttz_nxv8i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv8i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsub.vx v10, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: 
vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv8i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v10, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv8i16( %va, i1 false, %m, i32 %evl) ret %v @@ -973,64 +943,60 @@ define @vp_cttz_nxv16i16( %va, @vp_cttz_nxv16i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv16i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsub.vx v12, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi 
v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv16i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v12, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv16i16( %va, i1 false, %m, i32 %evl) ret %v @@ -1101,64 +1067,60 @@ define @vp_cttz_nxv32i16( %va, @vp_cttz_nxv32i16_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv32i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv32i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; 
RV64-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv32i16( %va, i1 false, %m, i32 %evl) ret %v @@ -1231,66 +1193,62 @@ define @vp_cttz_nxv1i32( %va, @vp_cttz_nxv1i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv1i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv1i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; 
RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv1i32( %va, i1 false, %m, i32 %evl) ret %v @@ -1363,66 +1321,62 @@ define @vp_cttz_nxv2i32( %va, @vp_cttz_nxv2i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv2i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv2i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: 
vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv2i32( %va, i1 false, %m, i32 %evl) ret %v @@ -1495,66 +1449,62 @@ define @vp_cttz_nxv4i32( %va, @vp_cttz_nxv4i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv4i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsub.vx v10, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv4i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx 
v10, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v10, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv4i32( %va, i1 false, %m, i32 %evl) ret %v @@ -1627,66 +1577,62 @@ define @vp_cttz_nxv8i32( %va, @vp_cttz_nxv8i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv8i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsub.vx v12, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv8i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, 
v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v12, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv8i32( %va, i1 false, %m, i32 %evl) ret %v @@ -1759,66 +1705,62 @@ define @vp_cttz_nxv16i32( %va, @vp_cttz_nxv16i32_unmasked( %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_nxv16i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: addi a0, a0, 1365 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv16i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: 
vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv16i32( %va, i1 false, %m, i32 %evl) ret %v @@ -1919,8 +1861,6 @@ define @vp_cttz_nxv1i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -1939,70 +1879,68 @@ define @vp_cttz_nxv1i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v10, v8, v9 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v8, v8, v9 ; RV32-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v9, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v9, v0.t +; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv1i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 ; RV64-NEXT: lui a0, %hi(.LCPI37_0) ; RV64-NEXT: ld a0, %lo(.LCPI37_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI37_1) ; RV64-NEXT: ld a1, %lo(.LCPI37_1)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: vand.vx v9, v8, a1, 
v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 +; RV64-NEXT: vand.vx v9, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: lui a0, %hi(.LCPI37_2) ; RV64-NEXT: ld a0, %lo(.LCPI37_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI37_3) ; RV64-NEXT: ld a1, %lo(.LCPI37_3)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv1i64( %va, i1 false, %m, i32 %evl) ret %v @@ -2103,8 +2041,6 @@ define @vp_cttz_nxv2i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -2123,70 +2059,68 @@ define @vp_cttz_nxv2i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsub.vx v10, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v10, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v10, v0.t +; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv2i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1, v0.t -; RV64-NEXT: 
vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v10 ; RV64-NEXT: lui a0, %hi(.LCPI39_0) ; RV64-NEXT: ld a0, %lo(.LCPI39_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI39_1) ; RV64-NEXT: ld a1, %lo(.LCPI39_1)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: vand.vx v10, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: vand.vx v10, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: lui a0, %hi(.LCPI39_2) ; RV64-NEXT: ld a0, %lo(.LCPI39_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI39_3) ; RV64-NEXT: ld a1, %lo(.LCPI39_3)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv2i64( %va, i1 false, %m, i32 %evl) ret %v @@ -2287,8 +2221,6 @@ define @vp_cttz_nxv4i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -2307,70 +2239,68 @@ define @vp_cttz_nxv4i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsub.vx v12, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v16, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vsetvli a2, zero, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v12, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, 
ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t +; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv4i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v12, v0.t +; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v12 ; RV64-NEXT: lui a0, %hi(.LCPI41_0) ; RV64-NEXT: ld a0, %lo(.LCPI41_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI41_1) ; RV64-NEXT: ld a1, %lo(.LCPI41_1)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: vand.vx v12, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: lui a0, %hi(.LCPI41_2) ; RV64-NEXT: ld a0, %lo(.LCPI41_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI41_3) ; RV64-NEXT: ld a1, %lo(.LCPI41_3)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv4i64( %va, i1 false, %m, i32 %evl) ret %v @@ -2471,8 +2401,6 @@ define @vp_cttz_nxv7i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -2491,70 +2419,68 @@ define @vp_cttz_nxv7i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 
2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv7i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: lui a0, %hi(.LCPI43_0) ; RV64-NEXT: ld a0, %lo(.LCPI43_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI43_1) ; RV64-NEXT: ld a1, %lo(.LCPI43_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: lui a0, %hi(.LCPI43_2) ; RV64-NEXT: ld a0, %lo(.LCPI43_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI43_3) ; RV64-NEXT: ld a1, %lo(.LCPI43_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv7i64( %va, i1 false, %m, i32 %evl) ret %v @@ -2655,8 +2581,6 @@ define @vp_cttz_nxv8i64_unmasked( %va, i32 ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -2675,70 +2599,68 @@ define @vp_cttz_nxv8i64_unmasked( %va, i32 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: addi a1, sp, 8 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, 
v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a1), zero ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv8i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: lui a0, %hi(.LCPI45_0) ; RV64-NEXT: ld a0, %lo(.LCPI45_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI45_1) ; RV64-NEXT: ld a1, %lo(.LCPI45_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: lui a0, %hi(.LCPI45_2) ; RV64-NEXT: ld a0, %lo(.LCPI45_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI45_3) ; RV64-NEXT: ld a1, %lo(.LCPI45_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv8i64( %va, i1 false, %m, i32 %evl) ret %v @@ -3039,12 +2961,9 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; RV32-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 
0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: sw a1, 12(sp) @@ -3068,81 +2987,31 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: and a3, a3, a2 ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v24, v16, a2, v0.t -; RV32-NEXT: vnot.v v16, v16, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsub.vx v24, v16, a2 +; RV32-NEXT: vnot.v v16, v16 +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsrl.vi v24, v16, 1 ; RV32-NEXT: addi a4, sp, 8 ; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a4), zero +; RV32-NEXT: vlse64.v v0, (a4), zero ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; RV32-NEXT: csrr a5, vlenb ; RV32-NEXT: li a6, 24 ; RV32-NEXT: mul a5, a5, a6 ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v16, v24, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vv v16, v16, v24, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v0, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vsub.vv v16, v16, v24 ; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v24, (a4), zero +; RV32-NEXT: vlse64.v v0, (a4), zero ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v16, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 5 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: csrr a5, vlenb -; RV32-NEXT: slli a5, a5, 4 -; RV32-NEXT: add a5, sp, a5 -; RV32-NEXT: addi a5, a5, 16 -; RV32-NEXT: vl8r.v v24, (a5) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vv v16, v24, v16, v0.t -; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vv v24, v16, v0 +; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: 
vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v24, (a4), zero ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma @@ -3151,7 +3020,7 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: add a5, sp, a5 ; RV32-NEXT: addi a5, a5, 16 ; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v16, v24, v0.t +; RV32-NEXT: vand.vv v24, v16, v24 ; RV32-NEXT: vsetvli a5, zero, e64, m8, ta, ma ; RV32-NEXT: vlse64.v v16, (a4), zero ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma @@ -3160,9 +3029,9 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vmul.vv v24, v24, v16, v0.t +; RV32-NEXT: vmul.vv v24, v24, v16 ; RV32-NEXT: li a3, 56 -; RV32-NEXT: vsrl.vx v16, v24, a3, v0.t +; RV32-NEXT: vsrl.vx v16, v24, a3 ; RV32-NEXT: addi a4, sp, 16 ; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: bltu a0, a1, .LBB47_2 @@ -3170,55 +3039,47 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: .LBB47_2: ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v24, v8, a2, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsub.vx v24, v8, a2 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v24 +; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v24, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 5 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v8, v8, a3, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v8, v8, a3 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_nxv16i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: sub a2, a0, a1 ; RV64-NEXT: sltu a3, a0, a2 @@ -3226,52 +3087,52 @@ define 
@vp_cttz_nxv16i64_unmasked( %va, i ; RV64-NEXT: and a3, a3, a2 ; RV64-NEXT: li a2, 1 ; RV64-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v16, a2, v0.t -; RV64-NEXT: vnot.v v16, v16, v0.t -; RV64-NEXT: vand.vv v16, v16, v24, v0.t +; RV64-NEXT: vsub.vx v24, v16, a2 +; RV64-NEXT: vnot.v v16, v16 +; RV64-NEXT: vand.vv v16, v16, v24 ; RV64-NEXT: lui a3, %hi(.LCPI47_0) -; RV64-NEXT: ld a4, %lo(.LCPI47_0)(a3) -; RV64-NEXT: lui a3, %hi(.LCPI47_1) -; RV64-NEXT: ld a3, %lo(.LCPI47_1)(a3) -; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV64-NEXT: vand.vx v24, v24, a4, v0.t -; RV64-NEXT: vsub.vv v16, v16, v24, v0.t -; RV64-NEXT: vand.vx v24, v16, a3, v0.t -; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV64-NEXT: vand.vx v16, v16, a3, v0.t -; RV64-NEXT: vadd.vv v16, v24, v16, v0.t +; RV64-NEXT: ld a3, %lo(.LCPI47_0)(a3) +; RV64-NEXT: lui a4, %hi(.LCPI47_1) +; RV64-NEXT: ld a4, %lo(.LCPI47_1)(a4) +; RV64-NEXT: vsrl.vi v24, v16, 1 +; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vsub.vv v16, v16, v24 +; RV64-NEXT: vand.vx v24, v16, a4 +; RV64-NEXT: vsrl.vi v16, v16, 2 +; RV64-NEXT: vand.vx v16, v16, a4 +; RV64-NEXT: vadd.vv v16, v24, v16 ; RV64-NEXT: lui a5, %hi(.LCPI47_2) ; RV64-NEXT: ld a5, %lo(.LCPI47_2)(a5) ; RV64-NEXT: lui a6, %hi(.LCPI47_3) ; RV64-NEXT: ld a6, %lo(.LCPI47_3)(a6) -; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV64-NEXT: vadd.vv v16, v16, v24, v0.t -; RV64-NEXT: vand.vx v16, v16, a5, v0.t -; RV64-NEXT: vmul.vx v16, v16, a6, v0.t +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vadd.vv v16, v16, v24 +; RV64-NEXT: vand.vx v16, v16, a5 +; RV64-NEXT: vmul.vx v16, v16, a6 ; RV64-NEXT: li a7, 56 -; RV64-NEXT: vsrl.vx v16, v16, a7, v0.t +; RV64-NEXT: vsrl.vx v16, v16, a7 ; RV64-NEXT: bltu a0, a1, .LBB47_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB47_2: ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v8, a2, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV64-NEXT: vand.vx v24, v24, a4, v0.t -; RV64-NEXT: vsub.vv v8, v8, v24, v0.t -; RV64-NEXT: vand.vx v24, v8, a3, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t -; RV64-NEXT: vadd.vv v8, v24, v8, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v24, v0.t -; RV64-NEXT: vand.vx v8, v8, a5, v0.t -; RV64-NEXT: vmul.vx v8, v8, a6, v0.t -; RV64-NEXT: vsrl.vx v8, v8, a7, v0.t +; RV64-NEXT: vsub.vx v24, v8, a2 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vx v24, v24, a3 +; RV64-NEXT: vsub.vv v8, v8, v24 +; RV64-NEXT: vand.vx v24, v8, a4 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a4 +; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vand.vx v8, v8, a5 +; RV64-NEXT: vmul.vx v8, v8, a6 +; RV64-NEXT: vsrl.vx v8, v8, a7 ; RV64-NEXT: ret - %head = insertelement poison, i1 false, i32 0 + %head = insertelement poison, i1 true, i32 0 %m = shufflevector %head, poison, zeroinitializer %v = call @llvm.vp.cttz.nxv16i64( %va, i1 false, %m, i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index e2e9fd8d0b495..4d363e0649725 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -37,30 +37,28 @@ define <2 x i8> @vp_ctlz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { 
define <2 x i8> @vp_ctlz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_v2i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement <2 x i1> poison, i1 false, i32 0 + %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer %v = call <2 x i8> @llvm.vp.ctlz.v2i8(<2 x i8> %va, i1 false, <2 x i1> %m, i32 %evl) ret <2 x i8> %v @@ -99,30 +97,28 @@ define <4 x i8> @vp_ctlz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i8> @vp_ctlz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_v4i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement <4 x i1> poison, i1 false, i32 0 + %head = 
insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.ctlz.v4i8(<4 x i8> %va, i1 false, <4 x i1> %m, i32 %evl) ret <4 x i8> %v @@ -161,30 +157,28 @@ define <8 x i8> @vp_ctlz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i8> @vp_ctlz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_v8i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement <8 x i1> poison, i1 false, i32 0 + %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.ctlz.v8i8(<8 x i8> %va, i1 false, <8 x i1> %m, i32 %evl) ret <8 x i8> %v @@ -223,30 +217,28 @@ define <16 x i8> @vp_ctlz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { define <16 x i8> @vp_ctlz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ctlz_v16i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 
15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement <16 x i1> poison, i1 false, i32 0 + %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer %v = call <16 x i8> @llvm.vp.ctlz.v16i8(<16 x i8> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i8> %v @@ -327,74 +319,70 @@ define <2 x i16> @vp_ctlz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i16> @vp_ctlz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v2i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v2i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, 
a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement <2 x i1> poison, i1 false, i32 0 + %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.ctlz.v2i16(<2 x i16> %va, i1 false, <2 x i1> %m, i32 %evl) ret <2 x i16> %v @@ -475,74 +463,70 @@ define <4 x i16> @vp_ctlz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i16> @vp_ctlz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v4i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t -; RV32-NEXT: lui a0, 1 +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v4i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t 
-; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement <4 x i1> poison, i1 false, i32 0 + %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.ctlz.v4i16(<4 x i16> %va, i1 false, <4 x i1> %m, i32 %evl) ret <4 x i16> %v @@ -623,74 +607,70 @@ define <8 x i16> @vp_ctlz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i16> @vp_ctlz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v8i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 
4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v8i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement <8 x i1> poison, i1 false, i32 0 + %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer %v = call <8 x i16> @llvm.vp.ctlz.v8i16(<8 x i16> %va, i1 false, <8 x i1> %m, i32 %evl) ret <8 x i16> %v @@ -771,74 +751,70 @@ define <16 x i16> @vp_ctlz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl define <16 x i16> @vp_ctlz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v16i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: 
vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v16i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v10, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement <16 x i1> poison, i1 false, i32 0 + %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer %v = call <16 x i16> @llvm.vp.ctlz.v16i16(<16 x i16> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i16> %v @@ -925,80 +901,76 @@ define <2 x i32> @vp_ctlz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i32> @vp_ctlz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v2i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, 
a0, e32, mf2, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v2i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, 
v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement <2 x i1> poison, i1 false, i32 0 + %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.ctlz.v2i32(<2 x i32> %va, i1 false, <2 x i1> %m, i32 %evl) ret <2 x i32> %v @@ -1085,80 +1057,76 @@ define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i32> @vp_ctlz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v4i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v4i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: 
vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement <4 x i1> poison, i1 false, i32 0 + %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i32> @llvm.vp.ctlz.v4i32(<4 x i32> %va, i1 false, <4 x i1> %m, i32 %evl) ret <4 x i32> %v @@ -1245,80 +1213,76 @@ define <8 x i32> @vp_ctlz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i32> @vp_ctlz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v8i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v10, v8 +; 
RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v8i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v10, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement <8 x i1> poison, i1 false, i32 0 + %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer %v = call <8 x i32> @llvm.vp.ctlz.v8i32(<8 x i32> %va, i1 false, <8 x i1> %m, i32 %evl) ret <8 x i32> %v @@ -1405,80 +1369,76 @@ define <16 x i32> @vp_ctlz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl define <16 x i32> @vp_ctlz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v16i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; 
RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vp_ctlz_v16i32_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 -; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: lui a0, 4112 +; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_ctlz_v16i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vsrl.vi v12, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v12, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: 
vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement <16 x i1> poison, i1 false, i32 0 + %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer %v = call <16 x i32> @llvm.vp.ctlz.v16i32(<16 x i32> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i32> %v @@ -1588,103 +1548,99 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v2i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t +; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v9, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v9, v0.t +; RV32-NEXT: vsrl.vx v9, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v9, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vxor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v10, v8, v9 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v8, v8, v9 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v9, v0.t +; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: 
vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v2i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v9, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI25_0) ; RV64-NEXT: ld a0, %lo(.LCPI25_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI25_1) ; RV64-NEXT: ld a1, %lo(.LCPI25_1)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: vand.vx v9, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 +; RV64-NEXT: vand.vx v9, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: lui a0, %hi(.LCPI25_2) ; RV64-NEXT: ld a0, %lo(.LCPI25_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI25_3) ; RV64-NEXT: ld a1, %lo(.LCPI25_3)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <2 x i1> poison, i1 false, i32 0 + %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl) ret <2 x i64> %v @@ -1794,103 +1750,99 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v4i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; 
RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vsrl.vx v10, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.i v10, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vxor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v10, v0.t +; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v4i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v10, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI27_0) ; RV64-NEXT: ld a0, %lo(.LCPI27_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI27_1) ; RV64-NEXT: ld a1, %lo(.LCPI27_1)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: 
vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: vand.vx v10, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: vand.vx v10, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: lui a0, %hi(.LCPI27_2) ; RV64-NEXT: ld a0, %lo(.LCPI27_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI27_3) ; RV64-NEXT: ld a1, %lo(.LCPI27_3)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <4 x i1> poison, i1 false, i32 0 + %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl) ret <4 x i64> %v @@ -2000,103 +1952,99 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v8i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.i v12, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vxor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: 
vand.vv v16, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t +; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v8i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v12, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v12, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI29_0) ; RV64-NEXT: ld a0, %lo(.LCPI29_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI29_1) ; RV64-NEXT: ld a1, %lo(.LCPI29_1)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: vand.vx v12, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: lui a0, %hi(.LCPI29_2) ; RV64-NEXT: ld a0, %lo(.LCPI29_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI29_3) ; RV64-NEXT: ld a1, %lo(.LCPI29_3)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <8 x i1> poison, i1 false, i32 0 + %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl) ret <8 x i64> %v @@ -2206,103 +2154,99 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext 
%evl define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v15i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a2, 349525 ; RV32-NEXT: addi a2, a2, 1365 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: addi a2, a2, -241 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_ctlz_v15i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, 
v16, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI31_0) ; RV64-NEXT: ld a0, %lo(.LCPI31_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI31_1) ; RV64-NEXT: ld a1, %lo(.LCPI31_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: lui a0, %hi(.LCPI31_2) ; RV64-NEXT: ld a0, %lo(.LCPI31_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI31_3) ; RV64-NEXT: ld a1, %lo(.LCPI31_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <15 x i1> poison, i1 false, i32 0 + %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl) ret <15 x i64> %v @@ -2412,103 +2356,99 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v16i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; 
RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a2, 349525 ; RV32-NEXT: addi a2, a2, 1365 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: addi a2, a2, -241 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t -; RV32-NEXT: ret -; -; RV64-LABEL: vp_ctlz_v16i64_unmasked: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 -; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_ctlz_v16i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0, v0.t -; RV64-NEXT: vor.vv v8, v8, v16, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a0, %hi(.LCPI33_0) ; RV64-NEXT: ld a0, %lo(.LCPI33_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI33_1) ; RV64-NEXT: ld a1, %lo(.LCPI33_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv 
v8, v16, v8 ; RV64-NEXT: lui a0, %hi(.LCPI33_2) ; RV64-NEXT: ld a0, %lo(.LCPI33_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI33_3) ; RV64-NEXT: ld a1, %lo(.LCPI33_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <16 x i1> poison, i1 false, i32 0 + %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i64> %v @@ -2892,102 +2832,89 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_ctlz_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-NEXT: li a1, 16 -; RV32-NEXT: vmclr.m v0 +; RV32-NEXT: vmv8r.v v0, v16 ; RV32-NEXT: mv a2, a0 ; RV32-NEXT: bltu a0, a1, .LBB35_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB35_2: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, -1 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 40 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t +; RV32-NEXT: vxor.vv v8, v8, 
v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a3, 349525 ; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 5 +; RV32-NEXT: li a4, 24 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v16, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: addi a3, a3, 819 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 24 -; RV32-NEXT: mul a3, a3, a4 -; RV32-NEXT: add a3, sp, a3 -; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.x v16, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v16, v8, v16, v0.t +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a3 +; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8, v0.t +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: lui a3, 4112 ; RV32-NEXT: addi a3, a3, 257 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v8, v24 ; RV32-NEXT: li a2, 56 -; RV32-NEXT: vsrl.vx v8, v8, a2, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 @@ -2998,68 +2925,56 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 48 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t -; RV32-NEXT: vor.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; 
RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v8, v0, 1 +; RV32-NEXT: vor.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v0, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vx v0, v8, a1 +; RV32-NEXT: vor.vv v8, v8, v0 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vxor.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v16, v8, a2 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 @@ -3067,82 +2982,80 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_ctlz_v32i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64-NEXT: li a2, 16 -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: mv a1, a0 ; RV64-NEXT: bltu a0, a2, .LBB35_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB35_2: ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v8, 16, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 8 
+; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vsrl.vi v24, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v24 ; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsrl.vx v24, v8, a1, v0.t -; RV64-NEXT: vor.vv v8, v8, v24, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t +; RV64-NEXT: vsrl.vx v24, v8, a1 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vnot.v v8, v8 ; RV64-NEXT: lui a2, %hi(.LCPI35_0) -; RV64-NEXT: ld a3, %lo(.LCPI35_0)(a2) -; RV64-NEXT: lui a2, %hi(.LCPI35_1) -; RV64-NEXT: ld a2, %lo(.LCPI35_1)(a2) -; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV64-NEXT: vand.vx v24, v24, a3, v0.t -; RV64-NEXT: vsub.vv v8, v8, v24, v0.t -; RV64-NEXT: vand.vx v24, v8, a2, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a2, v0.t -; RV64-NEXT: vadd.vv v8, v24, v8, v0.t +; RV64-NEXT: ld a2, %lo(.LCPI35_0)(a2) +; RV64-NEXT: lui a3, %hi(.LCPI35_1) +; RV64-NEXT: ld a3, %lo(.LCPI35_1)(a3) +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vx v24, v24, a2 +; RV64-NEXT: vsub.vv v8, v8, v24 +; RV64-NEXT: vand.vx v24, v8, a3 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vadd.vv v8, v24, v8 ; RV64-NEXT: lui a4, %hi(.LCPI35_2) ; RV64-NEXT: ld a4, %lo(.LCPI35_2)(a4) ; RV64-NEXT: lui a5, %hi(.LCPI35_3) ; RV64-NEXT: ld a5, %lo(.LCPI35_3)(a5) -; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v24, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vand.vx v8, v8, a4 +; RV64-NEXT: vmul.vx v8, v8, a5 ; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a6 ; RV64-NEXT: addi a7, a0, -16 ; RV64-NEXT: sltu a0, a0, a7 ; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: and a0, a0, a7 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 2, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 8, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 16, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vx v24, v16, a1, v0.t -; RV64-NEXT: vor.vv v16, v16, v24, v0.t -; RV64-NEXT: vnot.v v16, v16, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV64-NEXT: vand.vx v24, v24, a3, v0.t -; RV64-NEXT: vsub.vv v16, v16, v24, v0.t -; RV64-NEXT: vand.vx v24, v16, a2, v0.t -; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t -; RV64-NEXT: vadd.vv v16, v24, v16, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV64-NEXT: vadd.vv v16, v16, v24, v0.t -; RV64-NEXT: vand.vx v16, v16, a4, v0.t -; RV64-NEXT: vmul.vx v16, v16, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v16, a6, v0.t +; RV64-NEXT: vsrl.vi v24, v16, 1 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsrl.vi v24, v16, 2 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsrl.vi v24, v16, 8 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsrl.vi v24, v16, 16 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vsrl.vx v24, v16, a1 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vnot.v v16, v16 +; RV64-NEXT: vsrl.vi v24, v16, 1 +; RV64-NEXT: vand.vx v24, v24, a2 +; RV64-NEXT: vsub.vv v16, v16, v24 +; RV64-NEXT: vand.vx v24, v16, a3 +; RV64-NEXT: vsrl.vi v16, v16, 2 +; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsrl.vi v24, v16, 4 +; 
RV64-NEXT: vadd.vv v16, v16, v24 +; RV64-NEXT: vand.vx v16, v16, a4 +; RV64-NEXT: vmul.vx v16, v16, a5 +; RV64-NEXT: vsrl.vx v16, v16, a6 ; RV64-NEXT: ret - %head = insertelement <32 x i1> poison, i1 false, i32 0 + %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> %m, i32 %evl) ret <32 x i64> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index 02e8b0c9d4861..8703b14614485 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -34,27 +34,25 @@ define <2 x i8> @vp_cttz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i8> @vp_cttz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_v2i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v9, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement <2 x i1> poison, i1 false, i32 0 + %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer %v = call <2 x i8> @llvm.vp.cttz.v2i8(<2 x i8> %va, i1 false, <2 x i1> %m, i32 %evl) ret <2 x i8> %v @@ -90,27 +88,25 @@ define <4 x i8> @vp_cttz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i8> @vp_cttz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_v4i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v9, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; 
CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement <4 x i1> poison, i1 false, i32 0 + %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i8> @llvm.vp.cttz.v4i8(<4 x i8> %va, i1 false, <4 x i1> %m, i32 %evl) ret <4 x i8> %v @@ -146,27 +142,25 @@ define <8 x i8> @vp_cttz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i8> @vp_cttz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_v8i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v9, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement <8 x i1> poison, i1 false, i32 0 + %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer %v = call <8 x i8> @llvm.vp.cttz.v8i8(<8 x i8> %va, i1 false, <8 x i1> %m, i32 %evl) ret <8 x i8> %v @@ -202,27 +196,25 @@ define <16 x i8> @vp_cttz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { define <16 x i8> @vp_cttz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_cttz_v16i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmclr.m v0 ; CHECK-NEXT: li a1, 1 ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vsub.vx v9, v8, a1, v0.t -; CHECK-NEXT: vnot.v v8, v8, v0.t -; CHECK-NEXT: vand.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t +; CHECK-NEXT: vsub.vx v9, v8, a1 +; CHECK-NEXT: vnot.v v8, v8 +; CHECK-NEXT: vand.vv v8, v8, v9 +; CHECK-NEXT: vsrl.vi v9, v8, 1 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v9, a0 +; CHECK-NEXT: vsub.vv v8, v8, v9 ; CHECK-NEXT: li a0, 51 -; CHECK-NEXT: vand.vx v9, v8, a0, v0.t -; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vadd.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t -; CHECK-NEXT: vand.vi v8, v8, 15, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 +; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vadd.vv v8, 
v8, v9 +; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: ret - %head = insertelement <16 x i1> poison, i1 false, i32 0 + %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer %v = call <16 x i8> @llvm.vp.cttz.v16i8(<16 x i8> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i8> %v @@ -293,64 +285,60 @@ define <2 x i16> @vp_cttz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i16> @vp_cttz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v2i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v2i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement <2 x i1> poison, i1 false, i32 0 + %head = insertelement <2 x 
i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer %v = call <2 x i16> @llvm.vp.cttz.v2i16(<2 x i16> %va, i1 false, <2 x i1> %m, i32 %evl) ret <2 x i16> %v @@ -421,64 +409,60 @@ define <4 x i16> @vp_cttz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i16> @vp_cttz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v4i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v4i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement <4 x i1> poison, i1 false, i32 0 + %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i16> @llvm.vp.cttz.v4i16(<4 x 
i16> %va, i1 false, <4 x i1> %m, i32 %evl) ret <4 x i16> %v @@ -549,64 +533,60 @@ define <8 x i16> @vp_cttz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i16> @vp_cttz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v8i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v8i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement <8 x i1> poison, i1 false, i32 0 + %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer %v = call <8 x i16> @llvm.vp.cttz.v8i16(<8 x i16> %va, i1 false, <8 x i1> %m, i32 %evl) ret <8 x i16> %v @@ -677,64 +657,60 @@ define <16 x i16> @vp_cttz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 
zeroext %evl define <16 x i16> @vp_cttz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v16i16_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsub.vx v10, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: li a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v16i16_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v10, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: li a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %head = insertelement <16 x i1> poison, i1 false, i32 0 + %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer %v = call <16 x i16> @llvm.vp.cttz.v16i16(<16 x i16> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i16> %v @@ -807,66 +783,62 @@ define <2 x i32> @vp_cttz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i32> @vp_cttz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { ; 
RV32-LABEL: vp_cttz_v2i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v2i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement <2 x i1> poison, i1 false, i32 0 + %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer %v = call <2 x i32> @llvm.vp.cttz.v2i32(<2 x i32> %va, i1 false, <2 x i1> %m, i32 %evl) ret <2 x i32> %v @@ -939,66 +911,62 @@ define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i32> @vp_cttz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v4i32_unmasked: ; RV32: # %bb.0: 
-; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v4i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement <4 x i1> poison, i1 false, i32 0 + %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i32> @llvm.vp.cttz.v4i32(<4 x i32> %va, i1 false, <4 x i1> %m, i32 %evl) ret <4 x i32> %v @@ -1071,66 +1039,62 @@ define <8 x i32> @vp_cttz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i32> @vp_cttz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v8i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; 
RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vsub.vx v10, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v8i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v10, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v10, v8 +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement <8 x i1> poison, i1 false, i32 0 + %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer %v = call <8 x i32> @llvm.vp.cttz.v8i32(<8 x i32> %va, i1 false, <8 x i1> %m, i32 %evl) ret <8 x i32> %v @@ -1203,66 +1167,62 @@ define <16 x i32> @vp_cttz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl define <16 x i32> @vp_cttz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v16i32_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; 
RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1, v0.t -; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vsub.vx v12, v8, a1 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vx v8, v8, a0, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0, v0.t +; RV32-NEXT: vand.vx v8, v8, a0 ; RV32-NEXT: lui a0, 4112 ; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV32-NEXT: vmul.vx v8, v8, a0 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v16i32_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v12, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 1 ; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 ; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v12, v8 +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: lui a0, 4112 ; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 24, v0.t +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %head = insertelement <16 x i1> poison, i1 false, i32 0 + %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer %v = call <16 x i32> @llvm.vp.cttz.v16i32(<16 x i32> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i32> %v @@ -1352,83 +1312,79 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v2i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; 
RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vsub.vx v9, v8, a1, v0.t +; RV32-NEXT: vsub.vx v9, v8, a1 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v10, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v10, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t +; RV32-NEXT: vxor.vv v8, v8, v10 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v9, v9, v10, v0.t -; RV32-NEXT: vsub.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v9, v10 +; RV32-NEXT: vsub.vv v8, v8, v9 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v10, v8, v9, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v9, v0.t -; RV32-NEXT: vadd.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v10, v8, v9 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v9 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vand.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v8, v8, v9 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v9, v0.t +; RV32-NEXT: vmul.vv v8, v8, v9 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v2i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; RV64-NEXT: vsub.vx v9, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v9, v0.t +; RV64-NEXT: vsub.vx v9, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v9 ; RV64-NEXT: lui a0, %hi(.LCPI25_0) ; RV64-NEXT: ld a0, %lo(.LCPI25_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI25_1) ; RV64-NEXT: ld a1, %lo(.LCPI25_1)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vand.vx v9, v9, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v9, v0.t -; RV64-NEXT: vand.vx v9, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v9, v8, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsub.vv v8, v8, v9 +; RV64-NEXT: vand.vx v9, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: lui a0, %hi(.LCPI25_2) ; RV64-NEXT: ld a0, %lo(.LCPI25_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI25_3) ; RV64-NEXT: ld a1, %lo(.LCPI25_3)(a1) -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v9, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; 
RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <2 x i1> poison, i1 false, i32 0 + %head = insertelement <2 x i1> poison, i1 true, i32 0 %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl) ret <2 x i64> %v @@ -1518,83 +1474,79 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v4i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vsub.vx v10, v8, a1, v0.t +; RV32-NEXT: vsub.vx v10, v8, a1 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.i v12, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v12, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t +; RV32-NEXT: vxor.vv v8, v8, v12 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v10, v10, v12, v0.t -; RV32-NEXT: vsub.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v10, v12 +; RV32-NEXT: vsub.vv v8, v8, v10 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v12, v8, v10, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vadd.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v12, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v12, v8 +; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vand.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v10, v0.t +; RV32-NEXT: vmul.vv v8, v8, v10 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v4i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; RV64-NEXT: vsub.vx v10, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v10, v0.t +; RV64-NEXT: vsub.vx v10, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v10 ; RV64-NEXT: lui a0, %hi(.LCPI27_0) ; RV64-NEXT: ld a0, %lo(.LCPI27_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI27_1) ; RV64-NEXT: ld a1, %lo(.LCPI27_1)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vand.vx v10, v10, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v10, v0.t -; RV64-NEXT: vand.vx v10, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx 
v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v10, v8, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsub.vv v8, v8, v10 +; RV64-NEXT: vand.vx v10, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: lui a0, %hi(.LCPI27_2) ; RV64-NEXT: ld a0, %lo(.LCPI27_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI27_3) ; RV64-NEXT: ld a1, %lo(.LCPI27_3)(a1) -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v10, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <4 x i1> poison, i1 false, i32 0 + %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl) ret <4 x i64> %v @@ -1684,83 +1636,79 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v8i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vsub.vx v12, v8, a1, v0.t +; RV32-NEXT: vsub.vx v12, v8, a1 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.i v16, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v16, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 1, v0.t +; RV32-NEXT: vxor.vv v8, v8, v16 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a1, a1, 1365 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v12, v12, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v12, v12, v16 +; RV32-NEXT: vsub.vv v8, v8, v12 ; RV32-NEXT: lui a1, 209715 ; RV32-NEXT: addi a1, a1, 819 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v16, v8, v12, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v16, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: lui a1, 61681 ; RV32-NEXT: addi a1, a1, -241 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: lui a1, 4112 ; RV32-NEXT: addi a1, a1, 257 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v12, v0.t +; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v8i64_unmasked: 
; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; RV64-NEXT: vsub.vx v12, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v12, v0.t +; RV64-NEXT: vsub.vx v12, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v12 ; RV64-NEXT: lui a0, %hi(.LCPI29_0) ; RV64-NEXT: ld a0, %lo(.LCPI29_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI29_1) ; RV64-NEXT: ld a1, %lo(.LCPI29_1)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vand.vx v12, v12, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v12, v0.t -; RV64-NEXT: vand.vx v12, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v12, v8, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsub.vv v8, v8, v12 +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: lui a0, %hi(.LCPI29_2) ; RV64-NEXT: ld a0, %lo(.LCPI29_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI29_3) ; RV64-NEXT: ld a1, %lo(.LCPI29_3)(a1) -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v12, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <8 x i1> poison, i1 false, i32 0 + %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl) ret <8 x i64> %v @@ -1851,84 +1799,80 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v15i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v24, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vxor.vv v8, v8, v24 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a2, 349525 ; RV32-NEXT: addi a2, a2, 1365 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 
+; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: addi a2, a2, -241 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v15i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: lui a0, %hi(.LCPI31_0) ; RV64-NEXT: ld a0, %lo(.LCPI31_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI31_1) ; RV64-NEXT: ld a1, %lo(.LCPI31_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: lui a0, %hi(.LCPI31_2) ; RV64-NEXT: ld a0, %lo(.LCPI31_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI31_3) ; RV64-NEXT: ld a1, %lo(.LCPI31_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <15 x i1> poison, i1 false, i32 0 + %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl) ret <15 x i64> %v @@ -2019,84 +1963,80 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v16i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: vmclr.m v0 ; RV32-NEXT: li a1, 1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: vsub.vx v16, v8, a1 ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v24, -1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vxor.vv v8, v8, v24 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a2, 349525 ; RV32-NEXT: addi a2, a2, 1365 ; RV32-NEXT: vsetvli 
zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vadd.vv v8, v24, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: addi a2, a2, -241 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: lui a2, 4112 ; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 ; RV32-NEXT: li a0, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV32-NEXT: vsrl.vx v8, v8, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: vp_cttz_v16i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: li a1, 1 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v16, v8, a1, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v16, v0.t +; RV64-NEXT: vsub.vx v16, v8, a1 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v16 ; RV64-NEXT: lui a0, %hi(.LCPI33_0) ; RV64-NEXT: ld a0, %lo(.LCPI33_0)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI33_1) ; RV64-NEXT: ld a1, %lo(.LCPI33_1)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t -; RV64-NEXT: vsub.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v16, v8, a1, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vadd.vv v8, v16, v8, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a1 +; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: lui a0, %hi(.LCPI33_2) ; RV64-NEXT: ld a0, %lo(.LCPI33_2)(a0) ; RV64-NEXT: lui a1, %hi(.LCPI33_3) ; RV64-NEXT: ld a1, %lo(.LCPI33_3)(a1) -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v16, v0.t -; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vmul.vx v8, v8, a1, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vmul.vx v8, v8, a1 ; RV64-NEXT: li a0, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %head = insertelement <16 x i1> poison, i1 false, i32 0 + %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl) ret <16 x i64> %v @@ -2466,93 +2406,80 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl define <32 x i64> 
@vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-LABEL: vp_cttz_v32i64_unmasked: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 56 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; RV32-NEXT: li a1, 16 -; RV32-NEXT: vmclr.m v0 -; RV32-NEXT: mv a2, a0 -; RV32-NEXT: bltu a0, a1, .LBB35_2 -; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv8r.v v0, v16 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB35_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB35_2: -; RV32-NEXT: li a1, 1 -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 40 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; RV32-NEXT: li a2, 1 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsub.vx v16, v8, a2 ; RV32-NEXT: li a3, 32 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.i v24, -1 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 40 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: slli a4, a4, 5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vxor.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vxor.vv v8, v8, v24 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a4, 349525 ; RV32-NEXT: addi a4, a4, 1365 -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v16, v8, v16, v0.t +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a4, 209715 ; RV32-NEXT: addi a4, a4, 819 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 24 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t -; RV32-NEXT: vadd.vv v16, v24, v16, v0.t -; RV32-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV32-NEXT: vadd.vv v16, v16, v24, v0.t +; RV32-NEXT: vmv.v.x v16, a4 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; 
RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: lui a4, 61681 ; RV32-NEXT: addi a4, a4, -241 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a4, a4, 257 ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: addi a3, sp, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v16, v16, v8, v0.t -; RV32-NEXT: li a2, 56 -; RV32-NEXT: vsrl.vx v8, v16, a2, v0.t +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v24 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsrl.vx v8, v8, a1 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 3 ; RV32-NEXT: add a3, sp, a3 @@ -2564,57 +2491,45 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: and a0, a0, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a3, 48 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vsub.vx v16, v8, a1, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 40 -; RV32-NEXT: mul a0, a0, a1 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vxor.vv v8, v8, v24, v0.t -; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v8, v8, v16, v0.t +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vxor.vv v8, v0, v8 +; RV32-NEXT: vsub.vx v0, v0, a2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vsrl.vi v0, v8, 1 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 24 -; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v16, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v0, v0, v24 +; RV32-NEXT: vsub.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv 
v8, v8, v16 ; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v8, v16, v0.t -; RV32-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsrl.vx v16, v8, a1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 56 +; RV32-NEXT: li a1, 40 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 @@ -2622,9 +2537,7 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; ; RV64-LABEL: vp_cttz_v32i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64-NEXT: li a2, 16 -; RV64-NEXT: vmclr.m v0 ; RV64-NEXT: mv a1, a0 ; RV64-NEXT: bltu a0, a2, .LBB35_2 ; RV64-NEXT: # %bb.1: @@ -2632,52 +2545,52 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV64-NEXT: .LBB35_2: ; RV64-NEXT: li a2, 1 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v8, a2, v0.t -; RV64-NEXT: vnot.v v8, v8, v0.t -; RV64-NEXT: vand.vv v8, v8, v24, v0.t +; RV64-NEXT: vsub.vx v24, v8, a2 +; RV64-NEXT: vnot.v v8, v8 +; RV64-NEXT: vand.vv v8, v8, v24 ; RV64-NEXT: lui a1, %hi(.LCPI35_0) ; RV64-NEXT: ld a1, %lo(.LCPI35_0)(a1) ; RV64-NEXT: lui a3, %hi(.LCPI35_1) ; RV64-NEXT: ld a3, %lo(.LCPI35_1)(a3) -; RV64-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV64-NEXT: vand.vx v24, v24, a1, v0.t -; RV64-NEXT: vsub.vv v8, v8, v24, v0.t -; RV64-NEXT: vand.vx v24, v8, a3, v0.t -; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV64-NEXT: vand.vx v8, v8, a3, v0.t -; RV64-NEXT: vadd.vv v8, v24, v8, v0.t +; RV64-NEXT: vsrl.vi v24, v8, 1 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vsub.vv v8, v8, v24 +; RV64-NEXT: vand.vx v24, v8, a3 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vand.vx v8, v8, a3 +; RV64-NEXT: vadd.vv v8, v24, v8 ; RV64-NEXT: lui a4, %hi(.LCPI35_2) ; RV64-NEXT: ld a4, %lo(.LCPI35_2)(a4) ; RV64-NEXT: lui a5, %hi(.LCPI35_3) ; RV64-NEXT: ld a5, %lo(.LCPI35_3)(a5) -; RV64-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV64-NEXT: vadd.vv v8, v8, v24, v0.t -; RV64-NEXT: vand.vx v8, v8, a4, v0.t -; RV64-NEXT: vmul.vx v8, v8, a5, v0.t +; RV64-NEXT: vsrl.vi v24, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v24 +; RV64-NEXT: vand.vx v8, v8, a4 +; RV64-NEXT: vmul.vx v8, v8, a5 ; RV64-NEXT: li a6, 56 -; RV64-NEXT: vsrl.vx v8, v8, a6, v0.t +; RV64-NEXT: vsrl.vx v8, v8, a6 ; RV64-NEXT: addi a7, a0, -16 ; RV64-NEXT: sltu a0, a0, a7 ; RV64-NEXT: addi a0, a0, -1 ; RV64-NEXT: and a0, a0, a7 ; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV64-NEXT: vsub.vx v24, v16, a2, v0.t -; RV64-NEXT: vnot.v v16, v16, v0.t -; RV64-NEXT: vand.vv v16, v16, v24, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV64-NEXT: vand.vx v24, v24, a1, v0.t -; RV64-NEXT: vsub.vv v16, v16, v24, v0.t -; RV64-NEXT: vand.vx v24, v16, a3, v0.t -; RV64-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV64-NEXT: vand.vx v16, v16, a3, v0.t -; RV64-NEXT: vadd.vv v16, v24, v16, v0.t -; RV64-NEXT: vsrl.vi v24, v16, 4, v0.t -; RV64-NEXT: vadd.vv v16, v16, v24, v0.t -; RV64-NEXT: vand.vx v16, v16, a4, v0.t -; RV64-NEXT: vmul.vx v16, v16, a5, v0.t -; RV64-NEXT: vsrl.vx v16, v16, a6, v0.t +; RV64-NEXT: vsub.vx v24, v16, a2 +; RV64-NEXT: vnot.v v16, v16 +; RV64-NEXT: vand.vv v16, v16, v24 +; RV64-NEXT: vsrl.vi v24, v16, 1 +; RV64-NEXT: vand.vx v24, v24, a1 +; RV64-NEXT: vsub.vv v16, v16, v24 +; RV64-NEXT: vand.vx v24, v16, a3 +; RV64-NEXT: vsrl.vi 
v16, v16, 2 +; RV64-NEXT: vand.vx v16, v16, a3 +; RV64-NEXT: vadd.vv v16, v24, v16 +; RV64-NEXT: vsrl.vi v24, v16, 4 +; RV64-NEXT: vadd.vv v16, v16, v24 +; RV64-NEXT: vand.vx v16, v16, a4 +; RV64-NEXT: vmul.vx v16, v16, a5 +; RV64-NEXT: vsrl.vx v16, v16, a6 ; RV64-NEXT: ret - %head = insertelement <32 x i1> poison, i1 false, i32 0 + %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 false, <32 x i1> %m, i32 %evl) ret <32 x i64> %v From d9118b9eea7e95034502630bb3e0002625228825 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Tue, 30 May 2023 08:05:39 +0200 Subject: [PATCH 044/704] [bazel] Port for 9f6250f591057e68c0bda564716b6918b8e39a84, part2. The part1 was missing the generation of arm_sme_draft_spec_subject_to_change.h, this patch adds it. --- .../llvm-project-overlay/clang/BUILD.bazel | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 0ef8e1ad23ad7..6965f79d8a9bd 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1620,6 +1620,24 @@ gentbl( ], ) +gentbl( + name = "headers_arm_sme_draft_spec_subject_to_change_gen", + copts = [ + "-Wno-implicit-fallthrough", + "-Wno-error=frame-larger-than=", + ], + tbl_outs = [( + "-gen-arm-sme-header", + "lib/Headers/arm_sme_draft_spec_subject_to_change.h", + )], + tblgen = ":clang-tblgen", + td_file = "include/clang/Basic/arm_sme.td", + td_srcs = [ + "include/clang/Basic/arm_sme.td", + "include/clang/Basic/arm_sve_sme_incl.td", + ], +) + gentbl( name = "headers_riscv_vector", tbl_outs = [( @@ -1651,6 +1669,7 @@ builtin_headers = glob( "lib/Headers/arm_mve.h", "lib/Headers/arm_neon.h", "lib/Headers/arm_sve.h", + "lib/Headers/arm_sme_draft_spec_subject_to_change.h", "lib/Headers/arm_bf16.h", "lib/Headers/module.modulemap", "lib/Headers/riscv_vector.h", From bf916aeebd8c96c0618a1ccbb01a1517710f8766 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Tue, 30 May 2023 08:38:49 +0200 Subject: [PATCH 045/704] [clang] Solidate the implicit-module-header-maps.cpp lit test. On some systems, the second `split-file` command will fail because the current directory is the one we have deleted, we switch to another directory before running it. --- clang/test/Modules/implicit-module-header-maps.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/test/Modules/implicit-module-header-maps.cpp b/clang/test/Modules/implicit-module-header-maps.cpp index a190ff78f306f..bd4aeb640fb3c 100644 --- a/clang/test/Modules/implicit-module-header-maps.cpp +++ b/clang/test/Modules/implicit-module-header-maps.cpp @@ -7,6 +7,7 @@ // // RUN: %clang -Rmodule-build -fmodules -fimplicit-modules -fimplicit-module-maps -fmodule-map-file=module.modulemap -fsyntax-only -I hmap -fmodules-cache-path=%t test.cpp // +// RUN: cd %T // RUN: rm -rf %t // RUN: split-file %s %t // RUN: cd %t From 6fb26348e91e04b39aed38bf36c5603c48aa427d Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 18 May 2023 01:03:32 -0700 Subject: [PATCH 046/704] [LSAN] Move ThreadCreate into child thread Speeds up thread creation. Similar approach is already used by other sanitizers. 
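(Editorial note, not part of the patch: a minimal, standalone C++ sketch of the scheme this commit message describes. The parent encodes its thread id into the pthread start-routine argument and the child registers itself before running the user callback, so the old parent/child handshake is no longer needed. RegisterThread is a hypothetical stand-in for LSan's ThreadCreate/ThreadStart bookkeeping; only the control flow is meant to match the patch.)

#include <pthread.h>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the sanitizer's thread-registry bookkeeping.
static void RegisterThread(std::uintptr_t parent_tid, bool detached) {
  std::printf("child registered itself (parent tid %zu, detached %d)\n",
              static_cast<size_t>(parent_tid), static_cast<int>(detached));
}

// The child does its own registration: the parent's tid travels inside the
// pointer-sized start-routine argument, so the parent never has to wait for
// the child to pick up a tid before continuing.
template <bool Detached>
static void *ThreadStartFunc(void *arg) {
  RegisterThread(reinterpret_cast<std::uintptr_t>(arg), Detached);
  return nullptr;  // a real interceptor would now run the user's routine
}

int main() {
  std::uintptr_t this_tid = 1;  // stand-in for GetCurrentThreadId()
  pthread_t th;
  pthread_create(&th, /*attr=*/nullptr, ThreadStartFunc</*Detached=*/false>,
                 reinterpret_cast<void *>(this_tid));
  return pthread_join(th, nullptr);
}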
--- compiler-rt/lib/lsan/lsan_interceptors.cpp | 28 +++++++++------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp index fe852b9fcfa8d..ade9dfdab40c4 100644 --- a/compiler-rt/lib/lsan/lsan_interceptors.cpp +++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp @@ -415,8 +415,10 @@ INTERCEPTOR(char *, strerror, int errnum) { #if SANITIZER_POSIX -extern "C" void *__lsan_thread_start_func(void *arg) { - atomic_uintptr_t *atomic_tid = (atomic_uintptr_t *)arg; +template +static void *ThreadStartFunc(void *arg) { + u32 parent_tid = (uptr)arg; + uptr tid = ThreadCreate(parent_tid, Detached); // Wait until the last iteration to maximize the chance that we are the last // destructor to run. #if !SANITIZER_NETBSD && !SANITIZER_FREEBSD @@ -425,12 +427,8 @@ extern "C" void *__lsan_thread_start_func(void *arg) { Report("LeakSanitizer: failed to set thread key.\n"); Die(); } -#endif - int tid = 0; - while ((tid = atomic_load(atomic_tid, memory_order_acquire)) == 0) - internal_sched_yield(); +# endif ThreadStart(tid, GetTid()); - atomic_store(atomic_tid, 0, memory_order_release); auto self = GetThreadSelf(); auto args = GetThreadArgRetval().GetArgs(self); void *retval = (*args.routine)(args.arg_retval); @@ -442,17 +440,19 @@ INTERCEPTOR(int, pthread_create, void *th, void *attr, void *(*callback)(void *), void *param) { ENSURE_LSAN_INITED; EnsureMainThreadIDIsCorrect(); + bool detached = [attr]() { int d = 0; return attr && !pthread_attr_getdetachstate(attr, &d) && IsStateDetached(d); }(); + __sanitizer_pthread_attr_t myattr; if (!attr) { pthread_attr_init(&myattr); attr = &myattr; } AdjustStackSize(attr); - atomic_uintptr_t atomic_tid = {}; + uptr this_tid = GetCurrentThreadId(); int result; { // Ignore all allocations made by pthread_create: thread stack/TLS may be @@ -461,18 +461,12 @@ INTERCEPTOR(int, pthread_create, void *th, void *attr, // objects, the latter are calculated by obscure pointer arithmetic. ScopedInterceptorDisabler disabler; GetThreadArgRetval().Create(detached, {callback, param}, [&]() -> uptr { - result = - REAL(pthread_create)(th, attr, __lsan_thread_start_func, &atomic_tid); + result = REAL(pthread_create)( + th, attr, detached ? ThreadStartFunc : ThreadStartFunc, + (void *)this_tid); return result ? 
0 : *(uptr *)(th); }); } - if (result == 0) { - int tid = ThreadCreate(GetCurrentThreadId(), detached); - CHECK_NE(tid, kMainTid); - atomic_store(&atomic_tid, tid, memory_order_release); - while (atomic_load(&atomic_tid, memory_order_acquire) != 0) - internal_sched_yield(); - } if (attr == &myattr) pthread_attr_destroy(&myattr); return result; From 216e2820f96a59a0e86198fc4adea2dc0fc57a9b Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN Date: Sat, 27 May 2023 01:42:48 +0800 Subject: [PATCH 047/704] [RISCV] Add more tests in zdinx-boundary-check.ll Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D151534 --- .../CodeGen/RISCV/zdinx-boundary-check.ll | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll index 166eaca1d831b..940e022ecfdb3 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll @@ -4,11 +4,10 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zdinx -verify-machineinstrs < %s \ ; RUN: -target-abi=lp64 | FileCheck -check-prefix=RV64ZDINX %s -define void @foo(ptr nocapture %p, double %d) { +define void @foo(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: .cfi_def_cfa_offset 16 ; RV32ZDINX-NEXT: sw a1, 8(sp) ; RV32ZDINX-NEXT: sw a2, 12(sp) ; RV32ZDINX-NEXT: lw a2, 8(sp) @@ -29,11 +28,10 @@ entry: ret void } -define void @foo2(ptr nocapture %p, double %d) { +define void @foo2(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo2: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: .cfi_def_cfa_offset 16 ; RV32ZDINX-NEXT: sw a1, 8(sp) ; RV32ZDINX-NEXT: sw a2, 12(sp) ; RV32ZDINX-NEXT: lw a2, 8(sp) @@ -59,7 +57,7 @@ entry: @d = global double 4.2, align 8 -define void @foo3(ptr nocapture %p) { +define void @foo3(ptr nocapture %p) nounwind { ; RV32ZDINX-LABEL: foo3: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: lui a1, %hi(d) @@ -83,11 +81,10 @@ entry: ret void } -define void @foo4(ptr %p) { +define void @foo4(ptr %p) nounwind { ; RV32ZDINX-LABEL: foo4: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: .cfi_def_cfa_offset 16 ; RV32ZDINX-NEXT: sw a0, 8(sp) ; RV32ZDINX-NEXT: addi a0, a0, 2047 ; RV32ZDINX-NEXT: lw a1, 1(a0) @@ -101,7 +98,6 @@ define void @foo4(ptr %p) { ; RV64ZDINX-LABEL: foo4: ; RV64ZDINX: # %bb.0: # %entry ; RV64ZDINX-NEXT: addi sp, sp, -16 -; RV64ZDINX-NEXT: .cfi_def_cfa_offset 16 ; RV64ZDINX-NEXT: sd a0, 8(sp) ; RV64ZDINX-NEXT: ld a0, 2044(a0) ; RV64ZDINX-NEXT: lui a1, %hi(d) @@ -118,11 +114,10 @@ entry: ret void } -define void @foo5(ptr nocapture %p, double %d) { +define void @foo5(ptr nocapture %p, double %d) nounwind { ; RV32ZDINX-LABEL: foo5: ; RV32ZDINX: # %bb.0: # %entry ; RV32ZDINX-NEXT: addi sp, sp, -16 -; RV32ZDINX-NEXT: .cfi_def_cfa_offset 16 ; RV32ZDINX-NEXT: sw a1, 8(sp) ; RV32ZDINX-NEXT: sw a2, 12(sp) ; RV32ZDINX-NEXT: lw a2, 8(sp) @@ -143,3 +138,35 @@ entry: store double %d, ptr %add.ptr, align 8 ret void } + +define void @foo6(ptr %p, double %d) nounwind { +; RV32ZDINX-LABEL: foo6: +; RV32ZDINX: # %bb.0: # %entry +; RV32ZDINX-NEXT: addi sp, sp, -16 +; RV32ZDINX-NEXT: sw a1, 8(sp) +; RV32ZDINX-NEXT: sw a2, 12(sp) +; RV32ZDINX-NEXT: lw a2, 8(sp) +; RV32ZDINX-NEXT: lw a3, 12(sp) +; RV32ZDINX-NEXT: lui a1, %hi(.LCPI5_0) +; RV32ZDINX-NEXT: lw a4, %lo(.LCPI5_0)(a1) +; RV32ZDINX-NEXT: lw 
a5, %lo(.LCPI5_0+4)(a1) +; RV32ZDINX-NEXT: fadd.d a2, a2, a4 +; RV32ZDINX-NEXT: addi a0, a0, 2047 +; RV32ZDINX-NEXT: sw a2, -3(a0) +; RV32ZDINX-NEXT: sw a3, 1(a0) +; RV32ZDINX-NEXT: addi sp, sp, 16 +; RV32ZDINX-NEXT: ret +; +; RV64ZDINX-LABEL: foo6: +; RV64ZDINX: # %bb.0: # %entry +; RV64ZDINX-NEXT: lui a2, %hi(.LCPI5_0) +; RV64ZDINX-NEXT: ld a2, %lo(.LCPI5_0)(a2) +; RV64ZDINX-NEXT: fadd.d a1, a1, a2 +; RV64ZDINX-NEXT: sd a1, 2044(a0) +; RV64ZDINX-NEXT: ret +entry: + %add = fadd double %d, 3.140000e+00 + %add.ptr = getelementptr inbounds i8, ptr %p, i64 2044 + store double %add, ptr %add.ptr, align 8 + ret void +} From 6c55f4ba576652e45361b7788bc3f3092a7cd56b Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 29 May 2023 23:49:46 -0700 Subject: [PATCH 048/704] [NFC][hwasan] Don't copy ThreadStartArg --- compiler-rt/lib/hwasan/hwasan_interceptors.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/hwasan/hwasan_interceptors.cpp b/compiler-rt/lib/hwasan/hwasan_interceptors.cpp index f50a42d9d94ea..4eb5210e1b457 100644 --- a/compiler-rt/lib/hwasan/hwasan_interceptors.cpp +++ b/compiler-rt/lib/hwasan/hwasan_interceptors.cpp @@ -224,8 +224,8 @@ struct ThreadStartArg { static void *HwasanThreadStartFunc(void *arg) { __hwasan_thread_enter(); - ThreadStartArg A = *reinterpret_cast(arg); - SetSigProcMask(&A.starting_sigset_, nullptr); + SetSigProcMask(&reinterpret_cast(arg)->starting_sigset_, + nullptr); InternalFree(arg); auto self = GetThreadSelf(); auto args = hwasanThreadArgRetval().GetArgs(self); From df37e2211ea34f128a53912835bcbc96147a1408 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 30 May 2023 06:44:31 +0000 Subject: [PATCH 049/704] [mlir][llvm] Add thread-local address intrinsic Add the `llvm.threadlocal.address` intrinsic to the LLVM dialect. Signed-off-by: Lukas Sommer Reviewed By: gysit, zero9178 Differential Revision: https://reviews.llvm.org/D151566 --- mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 5 +++++ mlir/test/Target/LLVMIR/Import/intrinsic.ll | 8 ++++++++ mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir | 8 ++++++++ 3 files changed, 21 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index a409223ade155..e032bcc47b376 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -324,6 +324,11 @@ def LLVM_ExpectWithProbabilityOp let assemblyFormat = "$val `,` $expected `,` $prob attr-dict `:` type($val)"; } +def LLVM_ThreadlocalAddressOp : LLVM_OneResultIntrOp<"threadlocal.address", [], + [0], [Pure]> { + let arguments = (ins LLVM_AnyPointer:$global); +} + // // Coroutine intrinsics. 
// diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index e9b361509d037..3cc8b2f6fb785 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -613,6 +613,13 @@ define void @expect_with_probability(i16 %0) { ret void } +; CHECK-LABEL: llvm.func @threadlocal_test +define void @threadlocal_test(ptr %0) { + ; CHECK: "llvm.intr.threadlocal.address"(%{{.*}}) : (!llvm.ptr) -> !llvm.ptr + %local = call ptr @llvm.threadlocal.address.p0(ptr %0) + ret void +} + ; CHECK-LABEL: llvm.func @coro_id define void @coro_id(i32 %0, ptr %1) { ; CHECK: llvm.intr.coro.id %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : (i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> !llvm.token @@ -955,6 +962,7 @@ declare <8 x i32> @llvm.ushl.sat.v8i32(<8 x i32>, <8 x i32>) declare i1 @llvm.is.constant.i32(i32) declare i32 @llvm.expect.i32(i32, i32) declare i16 @llvm.expect.with.probability.i16(i16, i16, double immarg) +declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) declare ptr @llvm.coro.begin(token, ptr writeonly) declare i64 @llvm.coro.size.i64() diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index ec619b9a9d367..3e07ac90784ca 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -559,6 +559,13 @@ llvm.func @expect_with_probability(%arg0: i16) { llvm.return } +// CHECK-LABEL: @threadlocal_test +llvm.func @threadlocal_test(%arg0 : !llvm.ptr) { + // CHECK: call ptr @llvm.threadlocal.address.p0(ptr %{{.*}}) + "llvm.intr.threadlocal.address"(%arg0) : (!llvm.ptr) -> !llvm.ptr + llvm.return +} + // CHECK-LABEL: @sadd_sat_test llvm.func @sadd_sat_test(%arg0: i32, %arg1: i32, %arg2: vector<8xi32>, %arg3: vector<8xi32>) { // CHECK: call i32 @llvm.sadd.sat.i32 @@ -996,6 +1003,7 @@ llvm.func @lifetime(%p: !llvm.ptr) { // CHECK-DAG: declare i1 @llvm.is.constant.i32(i32) // CHECK-DAG: declare i32 @llvm.expect.i32(i32, i32) // CHECK-DAG: declare i16 @llvm.expect.with.probability.i16(i16, i16, double immarg) +// CHECK-DAG: declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) // CHECK-DAG: declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) // CHECK-DAG: declare ptr @llvm.coro.begin(token, ptr writeonly) // CHECK-DAG: declare i64 @llvm.coro.size.i64() From 5ddb623952cacba0a3933dacd4c70439ca95c70d Mon Sep 17 00:00:00 2001 From: Martin Braenne Date: Fri, 26 May 2023 09:52:55 +0000 Subject: [PATCH 050/704] [clang][dataflow] Remove unnecessary `ASTContext` parameter from `ControlFlowContext::build` overload. When introducing this new overload in https://reviews.llvm.org/D151183, I didn't consider that the `ASTContext` parameter was unnecessary because it could also be obtained from the `FunctionDecl`. 
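(Editorial note, not part of the patch: a short sketch of the call-site effect, using only the signatures visible in this diff. buildCFG is a hypothetical helper, not part of the dataflow framework.)

#include "clang/AST/Decl.h"
#include "clang/Analysis/FlowSensitive/ControlFlowContext.h"

// Before this change a caller had to thread the context through explicitly,
// e.g. ControlFlowContext::build(*Target, Context); now build() recovers it
// itself through Decl::getASTContext(), so only the declaration is needed.
static llvm::Expected<clang::dataflow::ControlFlowContext>
buildCFG(const clang::FunctionDecl &Func) {
  return clang::dataflow::ControlFlowContext::build(Func);
}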
Reviewed By: gribozavr2, xazax.hun Differential Revision: https://reviews.llvm.org/D151549 --- .../include/clang/Analysis/FlowSensitive/ControlFlowContext.h | 3 +-- clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp | 4 ++-- clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp | 2 +- clang/unittests/Analysis/FlowSensitive/TestingSupport.h | 2 +- .../Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h index f327011766069..bb36ed237c1e3 100644 --- a/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h +++ b/clang/include/clang/Analysis/FlowSensitive/ControlFlowContext.h @@ -33,8 +33,7 @@ class ControlFlowContext { public: /// Builds a ControlFlowContext from a `FunctionDecl`. /// `Func.hasBody()` must be true, and `Func.isTemplated()` must be false. - static llvm::Expected build(const FunctionDecl &Func, - ASTContext &C); + static llvm::Expected build(const FunctionDecl &Func); /// Builds a ControlFlowContext from an AST node. `D` is the function in which /// `S` resides. `D.isTemplated()` must be false. diff --git a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp index c62bff33524cf..c80525dc4f34f 100644 --- a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp +++ b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp @@ -68,13 +68,13 @@ static llvm::BitVector findReachableBlocks(const CFG &Cfg) { } llvm::Expected -ControlFlowContext::build(const FunctionDecl &Func, ASTContext &C) { +ControlFlowContext::build(const FunctionDecl &Func) { if (!Func.hasBody()) return llvm::createStringError( std::make_error_code(std::errc::invalid_argument), "Cannot analyze function without a body"); - return build(Func, *Func.getBody(), C); + return build(Func, *Func.getBody(), Func.getASTContext()); } llvm::Expected diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp index 32612397ec024..27ec15adc5350 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp @@ -211,7 +211,7 @@ DataflowAnalysisContext::getControlFlowContext(const FunctionDecl *F) { return &It->second; if (F->hasBody()) { - auto CFCtx = ControlFlowContext::build(*F, F->getASTContext()); + auto CFCtx = ControlFlowContext::build(*F); // FIXME: Handle errors. assert(CFCtx); auto Result = FunctionContexts.insert({F, std::move(*CFCtx)}); diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h index d5591bee12dc2..aa2b2a241b224 100644 --- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h +++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h @@ -241,7 +241,7 @@ checkDataflow(AnalysisInputs AI, llvm::errc::invalid_argument, "Could not find the target function."); // Build the control flow graph for the target function. 
- auto MaybeCFCtx = ControlFlowContext::build(*Target, Context); + auto MaybeCFCtx = ControlFlowContext::build(*Target); if (!MaybeCFCtx) return MaybeCFCtx.takeError(); auto &CFCtx = *MaybeCFCtx; diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp index 1d94b69cfce81..473750ad7a6cb 100644 --- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp @@ -65,7 +65,7 @@ runAnalysis(llvm::StringRef Code, AnalysisT (*MakeAnalysis)(ASTContext &)) { assert(Func != nullptr); auto CFCtx = - llvm::cantFail(ControlFlowContext::build(*Func, AST->getASTContext())); + llvm::cantFail(ControlFlowContext::build(*Func)); AnalysisT Analysis = MakeAnalysis(AST->getASTContext()); DataflowAnalysisContext DACtx(std::make_unique()); From 536b76e873c56994a7dc611a6081a7a79e9fb526 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Tue, 30 May 2023 15:05:41 +0800 Subject: [PATCH 051/704] [NFC] [serialization] Refactor the outdated AbrrevToUse of VarDecl The implementation and the comment of the AbrrevToUse of VarDecl looks not consistent with the implementation. This patch refactors it. --- clang/lib/Serialization/ASTWriterDecl.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 54b5e3877782d..bd935472bcef4 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -1136,10 +1136,10 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) { !D->isConstexpr() && !D->isInitCapture() && !D->isPreviousDeclInSameBlockScope() && - !D->hasAttr() && !D->isEscapingByref() && !HasDeducedType && D->getStorageDuration() != SD_Static && + !D->getDescribedVarTemplate() && !D->getMemberSpecializationInfo()) AbbrevToUse = Writer.getDeclVarAbbrev(); @@ -2244,8 +2244,8 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(0)); // InitStyle Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isARCPseudoStrong Abv->Add(BitCodeAbbrevOp(0)); // Linkage - Abv->Add(BitCodeAbbrevOp(0)); // HasInit - Abv->Add(BitCodeAbbrevOp(0)); // HasMemberSpecializationInfo + Abv->Add(BitCodeAbbrevOp(0)); // ModulesCodegen + Abv->Add(BitCodeAbbrevOp(0)); // VarKind (local enum) // ParmVarDecl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // IsObjCMethodParameter Abv->Add(BitCodeAbbrevOp(0)); // ScopeDepth @@ -2334,8 +2334,8 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(0)); // EscapingByref Abv->Add(BitCodeAbbrevOp(0)); // HasDeducedType Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // Linkage - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // HasConstant* - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 2)); // VarKind (local enum) + Abv->Add(BitCodeAbbrevOp(0)); // ModulesCodeGen + Abv->Add(BitCodeAbbrevOp(0)); // VarKind (local enum) // Type Source Info Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TypeLoc From b187215a44f764fe91237321b4f17caf9f434894 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 30 May 2023 00:20:18 -0700 Subject: [PATCH 052/704] Disable MLIR integration test that is failing on a bot, pending a LLVM backend fix --- .../test/Integration/Dialect/Vector/CPU/test-contraction.mlir | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir index 579dc86cad55b..400f207f37348 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ +// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts +// Disable the execution for now because of LLVM backend bug: https://github.com/llvm/llvm-project/issues/62995 +// | \ // RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_c_runner_utils | \ // RUN: FileCheck %s From 1d9a1139fd2c29189f2e2b9b149dfbd1a6b931bb Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Fri, 26 May 2023 15:50:59 +0000 Subject: [PATCH 053/704] [mlir] harden expensive-checks mode against ops with repeated operands Transform operations may indicate that they may accept and consume several handles pointing to the same or nested payload entities. The initial implementation of the expensive-checks mode was simply ignoring such cases as consuming the second handle would fail the check after the first handle invalidated it by consuming the same payload. Additional checks had been added since then, which could now trigger assertions in the expensive-checks module itself (instead of or in addition to use-after-free assertions down the road), specifically because the payload associations for invalidated handles is removed from the state to enable other kinds of checking. Rework the handling of transform operations with repeated handles so use-after-consume is still reported properly if the consumption happened by a preceding operation, as opposed to the a preceding operand of the same operation that is still (corretly) ignored if the op requests that. Depends on: D151560 Reviewed By: springerm Differential Revision: https://reviews.llvm.org/D151569 --- .../Transform/IR/TransformInterfaces.h | 104 +++++++++++++--- .../Transform/IR/TransformInterfaces.cpp | 111 ++++++++++++------ .../Dialect/Transform/expensive-checks.mlir | 22 ++++ .../TestTransformDialectExtension.cpp | 4 + .../TestTransformDialectExtension.td | 5 +- 5 files changed, 192 insertions(+), 54 deletions(-) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.h b/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.h index 4c07791b67c84..fc1ffebf4a321 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.h +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.h @@ -153,6 +153,10 @@ class TransformState { /// values in the payload IR. Also works for reverse mappings. using ValueMapping = DenseMap>; + /// Mapping between a Value in the transform IR and an error message that + /// should be emitted when the value is used. + using InvalidatedHandleMap = DenseMap>; + /// The bidirectional mappings between transform IR values and payload IR /// operations, and the mapping between transform IR values and parameters. struct Mappings { @@ -567,26 +571,85 @@ class TransformState { /// handle. LogicalResult replacePayloadValue(Value value, Value replacement); - /// If the operand is a handle consumed by the operation, i.e. 
has the "free" - /// memory effect associated with it, identifies other handles that are - /// pointing to payload IR operations nested in the operations pointed to by - /// the consumed handle. Marks all such handles as invalidated to trigger - /// errors if they are used. If `throughValue` is passed, record the fact that - /// an op handle was invalidated because a value handle associated with - /// results of the payload op or its block arguments was invalidated. + /// Records handle invalidation reporters into `newlyInvalidated`. + /// Specifically, + /// - `handle` is the op operand that consumes the handle, + /// - `potentialAncestors` is a list of ancestors of the payload operation + /// that the consumed handle is associated with, including itself, + /// - `throughValue` is the payload value the handle to which is consumed, + /// when it is the case, null when the operation handle is consumed + /// directly. + /// Iterates over all known operation and value handles and records reporters + /// for any potential future use of `handle` or any other handle that is + /// invalidated by its consumption, i.e., any handle pointing to any payload + /// IR entity (operation or value) associated with the same payload IR entity + /// as the consumed handle, or any nested payload IR entity. If + /// `potentialAncestors` is empty, records the reporter anyway. Does not + /// override existing reporters. This must remain a const method so it doesn't + /// inadvertently mutate `invalidatedHandles` too early. void recordOpHandleInvalidation(OpOperand &consumingHandle, ArrayRef potentialAncestors, - Value throughValue = nullptr); - void recordOpHandleInvalidationOne(OpOperand &handle, - ArrayRef potentialAncestors, - Operation *payloadOp, Value otherHandle, - Value throughValue = nullptr); - + Value throughValue, + InvalidatedHandleMap &newlyInvalidated) const; + + /// Records handle invalidation reporters into `newlyInvalidated`. + /// Specifically, + /// - `consumingHandle` is the op operand that consumes the handle, + /// - `potentialAncestors` is a list of ancestors of the payload operation + /// that the consumed handle is associated with, including itself, + /// - `payloadOp` is the operation itself, + /// - `otherHandle` is another that may be associated with the affected + /// payload operations + /// - `throughValue` is the payload value the handle to which is consumed, + /// when it is the case, null when the operation handle is consumed + /// directly. + /// Looks at the payload opreations associated with `otherHandle` and if any + /// of these operations has an ancestor (or is itself) listed in + /// `potentialAncestors`, records the error message describing the use of the + /// invalidated handle. Does nothing if `otherHandle` already has a reporter + /// associated with it. This must remain a const method so it doesn't + /// inadvertently mutate `invalidatedHandles` too early. + void recordOpHandleInvalidationOne( + OpOperand &consumingHandle, ArrayRef potentialAncestors, + Operation *payloadOp, Value otherHandle, Value throughValue, + InvalidatedHandleMap &newlyInvalidated) const; + + /// Records handle invalidation reporters into `newlyInvalidated`. 
+ /// Specifically, + /// - `opHandle` is the op operand that consumes the handle; + /// - `potentialAncestors` is a list of ancestors of the payload operation + /// that the consumed handle is associated with, including itself; + /// - `payloadValue` is the value defined by the operation associated with + /// the consuming handle as either op result or block argument; + /// - `valueHandle` is another that may be associated with the payload value. + /// Looks at the payload values associated with `valueHandle` and if any of + /// these values is defined, as op result or block argument, by an operation + /// whose ancestor (or the operation itself) is listed in + /// `potentialAncestors`, records the error message describing the use of the + /// invalidated handle. Does nothing if `valueHandle` already has a reporter + /// associated with it. This must remain a const method so it doesn't + /// inadvertently mutate `invalidatedHandles` too early. void recordValueHandleInvalidationByOpHandleOne( OpOperand &opHandle, ArrayRef potentialAncestors, - Value payloadValue, Value valueHandle); - - void recordValueHandleInvalidation(OpOperand &valueHandle); + Value payloadValue, Value valueHandle, + InvalidatedHandleMap &newlyInvalidated) const; + + /// Records handle invalidation reporters into `newlyInvalidated`. + /// Specifically, + /// - `valueHandle` is the op operand that consumes the handle, + /// - `throughValue` is the payload value the handle to which is consumed, + /// when it is the case, null when the operation handle is consumed + /// directly. + /// Iterates over all known operation and value handles and records reporters + /// for any potential future use of `handle` or any other handle that is + /// invalidated by its consumption, i.e., any handle pointing to any payload + /// IR entity (operation or value) associated with the same payload IR entity + /// as the consumed handle, or any nested payload IR entity. Does not override + /// existing reporters. This must remain a const method so it doesn't + /// inadvertently mutate `invalidatedHandles` too early. + void + recordValueHandleInvalidation(OpOperand &valueHandle, + InvalidatedHandleMap &newlyInvalidated) const; /// Checks that the operation does not use invalidated handles as operands. /// Reports errors and returns failure if it does. Otherwise, invalidates the @@ -596,6 +659,13 @@ class TransformState { LogicalResult checkAndRecordHandleInvalidation(TransformOpInterface transform); + /// Implementation of the checkAndRecordHandleInvalidation. This must remain a + /// const method so it doesn't inadvertently mutate `invalidatedHandles` too + /// early. + LogicalResult checkAndRecordHandleInvalidationImpl( + transform::TransformOpInterface transform, + transform::TransformState::InvalidatedHandleMap &newlyInvalidated) const; + /// Remove all nullptrs from op handles that were added by `replacePayloadOp`. void compactOpHandles(); @@ -628,7 +698,7 @@ class TransformState { /// describe when the handles were invalidated. Calling such a function emits /// a user-visible diagnostic with an additional note pointing to the given /// location. - DenseMap> invalidatedHandles; + InvalidatedHandleMap invalidatedHandles; #if LLVM_ENABLE_ABI_BREAKING_CHECKS /// A stack of nested regions that are being processed in the transform IR. 
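(Editorial note, not part of the patch: before the implementation diff below, a compact standalone sketch of the two-map scheme the declarations above describe. Reporter, InvalidatedMap and checkOperands are illustrative names, not the MLIR API; the point is that handles consumed by earlier ops always error, handles repeated among the current op's own operands error only when the op does not allow repeated handle operands, and new invalidation reporters become permanent only after every operand has been checked.)

#include <cstdio>
#include <functional>
#include <map>
#include <string>
#include <vector>

using Reporter = std::function<void()>;
using InvalidatedMap = std::map<std::string, Reporter>;

// Returns false and fires a reporter on the first invalid handle use.
static bool checkOperands(const std::vector<std::string> &operands,
                          bool allowsRepeatedHandles,
                          InvalidatedMap &invalidated /* state-wide map */) {
  InvalidatedMap newlyInvalidated;  // only what this op invalidates
  for (const std::string &handle : operands) {
    if (auto it = invalidated.find(handle); it != invalidated.end()) {
      it->second();  // consumed by a previously executed op: always an error
      return false;
    }
    if (auto nit = newlyInvalidated.find(handle);
        !allowsRepeatedHandles && nit != newlyInvalidated.end()) {
      nit->second();  // consumed twice by this very op: error unless allowed
      return false;
    }
    // Treat every operand as consuming and record a reporter for later users.
    newlyInvalidated.try_emplace(handle, [handle] {
      std::printf("use of a handle consumed earlier: %s\n", handle.c_str());
    });
  }
  // Commit the new invalidations only once all operands have been checked;
  // map::merge keeps pre-existing entries, mirroring the "does not override
  // existing reporters" contract documented above.
  invalidated.merge(newlyInvalidated);
  return true;
}

int main() {
  InvalidatedMap state;
  checkOperands({"%h", "%h"}, /*allowsRepeatedHandles=*/true, state);  // ok
  checkOperands({"%h"}, /*allowsRepeatedHandles=*/true, state);        // error
}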
diff --git a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp index 85535c77865c1..b1dc66892c4c7 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp @@ -431,10 +431,13 @@ transform::TransformState::replacePayloadValue(Value value, Value replacement) { void transform::TransformState::recordOpHandleInvalidationOne( OpOperand &consumingHandle, ArrayRef potentialAncestors, - Operation *payloadOp, Value otherHandle, Value throughValue) { + Operation *payloadOp, Value otherHandle, Value throughValue, + transform::TransformState::InvalidatedHandleMap &newlyInvalidated) const { // If the op is associated with invalidated handle, skip the check as it - // may be reading invalid IR. - if (invalidatedHandles.count(otherHandle)) + // may be reading invalid IR. This also ensures we report the first + // invalidation and not the last one. + if (invalidatedHandles.count(otherHandle) || + newlyInvalidated.count(otherHandle)) return; FULL_LDBG("--recordOpHandleInvalidationOne\n"); @@ -467,9 +470,9 @@ void transform::TransformState::recordOpHandleInvalidationOne( Location opLoc = payloadOp->getLoc(); std::optional throughValueLoc = throughValue ? std::make_optional(throughValue.getLoc()) : std::nullopt; - invalidatedHandles[otherHandle] = [ancestorLoc, opLoc, owner, operandNo, - otherHandle, - throughValueLoc](Location currentLoc) { + newlyInvalidated[otherHandle] = [ancestorLoc, opLoc, owner, operandNo, + otherHandle, + throughValueLoc](Location currentLoc) { InFlightDiagnostic diag = emitError(currentLoc) << "op uses a handle invalidated by a " "previously executed transform op"; @@ -490,11 +493,14 @@ void transform::TransformState::recordOpHandleInvalidationOne( } void transform::TransformState::recordValueHandleInvalidationByOpHandleOne( - OpOperand &consumingHandle, ArrayRef potentialAncestors, - Value payloadValue, Value valueHandle) { + OpOperand &opHandle, ArrayRef potentialAncestors, + Value payloadValue, Value valueHandle, + transform::TransformState::InvalidatedHandleMap &newlyInvalidated) const { // If the op is associated with invalidated handle, skip the check as it - // may be reading invalid IR. - if (invalidatedHandles.count(valueHandle)) + // may be reading invalid IR. This also ensures we report the first + // invalidation and not the last one. 
+ if (invalidatedHandles.count(valueHandle) || + newlyInvalidated.count(valueHandle)) return; for (Operation *ancestor : potentialAncestors) { @@ -517,12 +523,12 @@ void transform::TransformState::recordValueHandleInvalidationByOpHandleOne( if (!ancestor->isAncestor(definingOp)) continue; - Operation *owner = consumingHandle.getOwner(); - unsigned operandNo = consumingHandle.getOperandNumber(); + Operation *owner = opHandle.getOwner(); + unsigned operandNo = opHandle.getOperandNumber(); Location ancestorLoc = ancestor->getLoc(); Location opLoc = definingOp->getLoc(); Location valueLoc = payloadValue.getLoc(); - invalidatedHandles[valueHandle] = + newlyInvalidated[valueHandle] = [valueHandle, owner, operandNo, resultNo, argumentNo, blockNo, regionNo, ancestorLoc, opLoc, valueLoc](Location currentLoc) { InFlightDiagnostic diag = emitError(currentLoc) @@ -551,7 +557,8 @@ void transform::TransformState::recordValueHandleInvalidationByOpHandleOne( void transform::TransformState::recordOpHandleInvalidation( OpOperand &handle, ArrayRef potentialAncestors, - Value throughValue) { + Value throughValue, + transform::TransformState::InvalidatedHandleMap &newlyInvalidated) const { if (potentialAncestors.empty()) { DEBUG_WITH_TYPE(DEBUG_TYPE_FULL, { @@ -561,7 +568,7 @@ void transform::TransformState::recordOpHandleInvalidation( Operation *owner = handle.getOwner(); unsigned operandNo = handle.getOperandNumber(); - invalidatedHandles[handle.get()] = [owner, operandNo](Location currentLoc) { + newlyInvalidated[handle.get()] = [owner, operandNo](Location currentLoc) { InFlightDiagnostic diag = emitError(currentLoc) << "op uses a handle associated with empty " "payload and invalidated by a " @@ -580,14 +587,16 @@ void transform::TransformState::recordOpHandleInvalidation( // number of IR objects (operations and values). Alternatively, we could walk // the IR nested in each payload op associated with the given handle and look // for handles associated with each operation and value. - for (const Mappings &mapping : llvm::make_second_range(mappings)) { + for (const transform::TransformState::Mappings &mapping : + llvm::make_second_range(mappings)) { // Go over all op handle mappings and mark as invalidated any handle // pointing to any of the payload ops associated with the given handle or // any op nested in them. for (const auto &[payloadOp, otherHandles] : mapping.reverse) { for (Value otherHandle : otherHandles) recordOpHandleInvalidationOne(handle, potentialAncestors, payloadOp, - otherHandle, throughValue); + otherHandle, throughValue, + newlyInvalidated); } // Go over all value handle mappings and mark as invalidated any handle // pointing to any result of the payload op associated with the given handle @@ -597,13 +606,15 @@ void transform::TransformState::recordOpHandleInvalidation( for (const auto &[payloadValue, valueHandles] : mapping.reverseValues) { for (Value valueHandle : valueHandles) recordValueHandleInvalidationByOpHandleOne(handle, potentialAncestors, - payloadValue, valueHandle); + payloadValue, valueHandle, + newlyInvalidated); } } } void transform::TransformState::recordValueHandleInvalidation( - OpOperand &valueHandle) { + OpOperand &valueHandle, + transform::TransformState::InvalidatedHandleMap &newlyInvalidated) const { // Invalidate other handles to the same value. 
for (Value payloadValue : getPayloadValues(valueHandle.get())) { SmallVector otherValueHandles; @@ -612,8 +623,8 @@ void transform::TransformState::recordValueHandleInvalidation( Operation *owner = valueHandle.getOwner(); unsigned operandNo = valueHandle.getOperandNumber(); Location valueLoc = payloadValue.getLoc(); - invalidatedHandles[otherHandle] = [otherHandle, owner, operandNo, - valueLoc](Location currentLoc) { + newlyInvalidated[otherHandle] = [otherHandle, owner, operandNo, + valueLoc](Location currentLoc) { InFlightDiagnostic diag = emitError(currentLoc) << "op uses a handle invalidated by a " "previously executed transform op"; @@ -629,17 +640,24 @@ void transform::TransformState::recordValueHandleInvalidation( if (auto opResult = llvm::dyn_cast(payloadValue)) { Operation *payloadOp = opResult.getOwner(); - recordOpHandleInvalidation(valueHandle, payloadOp, payloadValue); + recordOpHandleInvalidation(valueHandle, payloadOp, payloadValue, + newlyInvalidated); } else { auto arg = llvm::dyn_cast(payloadValue); for (Operation &payloadOp : *arg.getOwner()) - recordOpHandleInvalidation(valueHandle, &payloadOp, payloadValue); + recordOpHandleInvalidation(valueHandle, &payloadOp, payloadValue, + newlyInvalidated); } } } -LogicalResult transform::TransformState::checkAndRecordHandleInvalidation( - TransformOpInterface transform) { +/// Checks that the operation does not use invalidated handles as operands. +/// Reports errors and returns failure if it does. Otherwise, invalidates the +/// handles consumed by the operation as well as any handles pointing to payload +/// IR operations nested in the operations associated with the consumed handles. +LogicalResult transform::TransformState::checkAndRecordHandleInvalidationImpl( + transform::TransformOpInterface transform, + transform::TransformState::InvalidatedHandleMap &newlyInvalidated) const { FULL_LDBG("--Start checkAndRecordHandleInvalidation\n"); auto memoryEffectsIface = cast(transform.getOperation()); @@ -651,13 +669,23 @@ LogicalResult transform::TransformState::checkAndRecordHandleInvalidation( DEBUG_WITH_TYPE(DEBUG_TYPE_FULL, { (DBGS() << "----iterate on handle: " << target.get() << "\n"); }); - // If the operand uses an invalidated handle, report it. + // If the operand uses an invalidated handle, report it. If the operation + // allows handles to point to repeated payload operations, only report + // pre-existing invalidation errors. Otherwise, also report invalidations + // caused by the current transform operation affecting its other operands. auto it = invalidatedHandles.find(target.get()); - if (!transform.allowsRepeatedHandleOperands() && - it != invalidatedHandles.end()) { - FULL_LDBG("--End checkAndRecordHandleInvalidation -> FAILURE\n"); + auto nit = newlyInvalidated.find(target.get()); + if (it != invalidatedHandles.end()) { + FULL_LDBG("--End checkAndRecordHandleInvalidation, found already " + "invalidated -> FAILURE\n"); return it->getSecond()(transform->getLoc()), failure(); } + if (!transform.allowsRepeatedHandleOperands() && + nit != newlyInvalidated.end()) { + FULL_LDBG("--End checkAndRecordHandleInvalidation, found newly " + "invalidated (by this op) -> FAILURE\n"); + return nit->getSecond()(transform->getLoc()), failure(); + } // Invalidate handles pointing to the operations nested in the operation // associated with the handle consumed by this operation. 
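Note the lookup order this hunk establishes and that the next hunk continues: a handle invalidated by a previously executed transform op is always an error, whereas a handle invalidated by the current op through one of its other operands is an error only when the op does not allow repeated handle operands. A small self-contained sketch of just that ordering, with invented names (checkOperand, string-keyed maps) used purely for illustration:

  #include <functional>
  #include <map>
  #include <string>

  using Reporter = std::function<void()>;
  using HandleMap = std::map<std::string, Reporter>;

  // Returns true if `handle` may still be used by the op being checked.
  // `invalidated` holds entries from previously executed ops,
  // `newlyInvalidated` holds entries produced while checking the current op.
  bool checkOperand(const std::string &handle, bool allowsRepeatedOperands,
                    const HandleMap &invalidated,
                    const HandleMap &newlyInvalidated) {
    auto it = invalidated.find(handle);
    if (it != invalidated.end()) {
      it->second(); // report a use of a handle invalidated by an earlier op
      return false;
    }
    auto nit = newlyInvalidated.find(handle);
    if (!allowsRepeatedOperands && nit != newlyInvalidated.end()) {
      nit->second(); // report an invalidation caused by this very op
      return false;
    }
    return true;
  }

The sketch checks the pre-existing map unconditionally, matching how the Impl method above consults allowsRepeatedHandleOperands() only for the newly invalidated entries.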
@@ -666,15 +694,18 @@ LogicalResult transform::TransformState::checkAndRecordHandleInvalidation( effect.getValue() == target.get(); }; if (llvm::any_of(effects, consumesTarget)) { - FULL_LDBG("----found consume effect -> SKIP\n"); - if (llvm::isa(target.get().getType())) { + FULL_LDBG("----found consume effect\n"); + if (llvm::isa( + target.get().getType())) { FULL_LDBG("----recordOpHandleInvalidation\n"); - ArrayRef payloadOps = getPayloadOpsView(target.get()); - recordOpHandleInvalidation(target, payloadOps); - } else if (llvm::isa( + SmallVector payloadOps = + llvm::to_vector(getPayloadOps(target.get())); + recordOpHandleInvalidation(target, payloadOps, nullptr, + newlyInvalidated); + } else if (llvm::isa( target.get().getType())) { FULL_LDBG("----recordValueHandleInvalidation\n"); - recordValueHandleInvalidation(target); + recordValueHandleInvalidation(target, newlyInvalidated); } else { FULL_LDBG("----not a TransformHandle -> SKIP AND DROP ON THE FLOOR\n"); } @@ -687,6 +718,16 @@ LogicalResult transform::TransformState::checkAndRecordHandleInvalidation( return success(); } +LogicalResult transform::TransformState::checkAndRecordHandleInvalidation( + transform::TransformOpInterface transform) { + InvalidatedHandleMap newlyInvalidated; + LogicalResult checkResult = + checkAndRecordHandleInvalidationImpl(transform, newlyInvalidated); + invalidatedHandles.insert(std::make_move_iterator(newlyInvalidated.begin()), + std::make_move_iterator(newlyInvalidated.end())); + return checkResult; +} + template DiagnosedSilenceableFailure checkRepeatedConsumptionInOperand(ArrayRef payload, diff --git a/mlir/test/Dialect/Transform/expensive-checks.mlir b/mlir/test/Dialect/Transform/expensive-checks.mlir index 4cbaad87331d5..e35c1791da939 100644 --- a/mlir/test/Dialect/Transform/expensive-checks.mlir +++ b/mlir/test/Dialect/Transform/expensive-checks.mlir @@ -342,3 +342,25 @@ transform.sequence failures(propagate) { // expected-error @below {{uses a handle associated with empty payload and invalidated by a previously executed transform op}} transform.test_print_remark_at_operand %0, "remark" : !transform.any_op } + +// ----- + +// Make sure we properly report a use-after-consume error when repeated handles +// are allowed in the consuming op. We still want to report handles consumed by +// _previous_ operations, just not by this one. To bypass the quick static check +// of repeated consumption, create a handle to the transform operation and +// invalidate the handle to the root module thus invalidating all other handles. 
+ +// expected-note @below {{ancestor payload op}} +module { + transform.sequence failures(propagate) { + ^bb0(%arg0: !transform.any_op): + // expected-note @below {{handle to invalidated ops}} + // expected-note @below {{nested payload op}} + %0 = transform.test_produce_self_handle_or_forward_operand : () -> !transform.any_op + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + transform.test_consume_operand %arg0 : !transform.any_op + // expected-error @below {{uses a handle invalidated by a previously executed transform op}} + transform.test_consume_operand %0 { allow_repeated_handles } : !transform.any_op + } +} diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp index 5bf488e579981..f3b6c19d90b16 100644 --- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp +++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp @@ -178,6 +178,10 @@ void mlir::test::TestProduceValueHandleToArgumentOfParentBlock::getEffects( transform::onlyReadsPayload(effects); } +bool mlir::test::TestConsumeOperand::allowsRepeatedHandleOperands() { + return getAllowRepeatedHandles(); +} + DiagnosedSilenceableFailure mlir::test::TestConsumeOperand::apply(transform::TransformResults &results, transform::TransformState &state) { diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td index b1129ea5980cb..c02e2d97663d1 100644 --- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td +++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td @@ -97,11 +97,12 @@ def TestProduceValueHandleToArgumentOfParentBlock } def TestConsumeOperand : Op, + [DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let arguments = (ins Transform_AnyHandleOrParamType:$operand, - Optional:$second_operand); + Optional:$second_operand, + UnitAttr:$allow_repeated_handles); let assemblyFormat = "$operand (`,` $second_operand^)? attr-dict `:` type($operand)" "(`,` type($second_operand)^)?"; From e256f552c81527712cc3dcf245ca606c43742915 Mon Sep 17 00:00:00 2001 From: David Candler Date: Tue, 30 May 2023 08:57:27 +0100 Subject: [PATCH 054/704] [builtins][test] Skip scalbn rounding tests on newlib The picolib/newlib implementation of scalbn gives slightly different results compared to glibc and compiler-rt's inlined __compiler_rt_scalbn in certain rounding modes. Since these tests are already disabled for msvc which doesn't respect the mode change, this patch skips them for newlib as well. Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D150280 --- compiler-rt/test/builtins/Unit/compiler_rt_scalbn_test.c | 6 +++++- compiler-rt/test/builtins/Unit/compiler_rt_scalbnf_test.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/builtins/Unit/compiler_rt_scalbn_test.c b/compiler-rt/test/builtins/Unit/compiler_rt_scalbn_test.c index e1cda63246fd0..990e7b947cb2e 100644 --- a/compiler-rt/test/builtins/Unit/compiler_rt_scalbn_test.c +++ b/compiler-rt/test/builtins/Unit/compiler_rt_scalbn_test.c @@ -63,7 +63,11 @@ int main() { defined(__i386__) || defined(__x86_64__) // Skip these tests for MSVC because its scalbn function always behaves as if // the default rounding mode is set (FE_TONEAREST). 
-#ifndef _MSC_VER +// Also skip for newlib because although its scalbn function does respect the +// rounding mode, where the tests trigger an underflow or overflow using a +// large exponent the result is rounded in the opposite direction to that which +// would be expected in the (FE_UPWARD) and (FE_DOWNWARD) modes. +# if !defined(_MSC_VER) && !defined(_NEWLIB_VERSION) fesetround(FE_UPWARD); if (iterate_cases("FE_UPWARD")) return 1; diff --git a/compiler-rt/test/builtins/Unit/compiler_rt_scalbnf_test.c b/compiler-rt/test/builtins/Unit/compiler_rt_scalbnf_test.c index dfa34232604fe..9edfe4aa1de3e 100644 --- a/compiler-rt/test/builtins/Unit/compiler_rt_scalbnf_test.c +++ b/compiler-rt/test/builtins/Unit/compiler_rt_scalbnf_test.c @@ -62,7 +62,11 @@ int main() { defined(__i386__) || defined(__x86_64__) // Skip these tests for MSVC because its scalbnf function always behaves as if // the default rounding mode is set (FE_TONEAREST). -#ifndef _MSC_VER +// Also skip for newlib because although its scalbnf function does respect the +// rounding mode, where the tests trigger an underflow or overflow using a +// large exponent the result is rounded in the opposite direction to that which +// would be expected in the (FE_UPWARD) and (FE_DOWNWARD) modes. +# if !defined(_MSC_VER) && !defined(_NEWLIB_VERSION) fesetround(FE_UPWARD); if (iterate_cases("FE_UPWARD")) return 1; From c7592c7714c2796715e6460fc5fd19e5a930c427 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Tue, 30 May 2023 00:32:32 -0700 Subject: [PATCH 055/704] [mlir][scf] NFC - Add debug information to scf pipelining --- .../Dialect/SCF/Transforms/LoopPipelining.cpp | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp index a85985b84a037..9b673d6f1de93 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp @@ -20,6 +20,11 @@ #include "mlir/Support/MathExtras.h" #include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/MapVector.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "scf-loop-pipelining" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") using namespace mlir; using namespace mlir::scf; @@ -84,26 +89,33 @@ struct LoopPipelinerInternal { bool LoopPipelinerInternal::initializeLoopInfo( ForOp op, const PipeliningOption &options) { + LDBG("Start initializeLoopInfo"); forOp = op; auto upperBoundCst = forOp.getUpperBound().getDefiningOp(); auto lowerBoundCst = forOp.getLowerBound().getDefiningOp(); auto stepCst = forOp.getStep().getDefiningOp(); - if (!upperBoundCst || !lowerBoundCst || !stepCst) + if (!upperBoundCst || !lowerBoundCst || !stepCst) { + LDBG("--no constant bounds or step -> BAIL"); return false; + } ub = upperBoundCst.value(); lb = lowerBoundCst.value(); step = stepCst.value(); peelEpilogue = options.peelEpilogue; predicateFn = options.predicateFn; - if (!peelEpilogue && predicateFn == nullptr) + if (!peelEpilogue && predicateFn == nullptr) { + LDBG("--no epilogue or predicate set -> BAIL"); return false; + } int64_t numIteration = ceilDiv(ub - lb, step); std::vector> schedule; options.getScheduleFn(forOp, schedule); - if (schedule.empty()) + if (schedule.empty()) { + LDBG("--empty schedule -> BAIL"); return false; + } opOrder.reserve(schedule.size()); for (auto &opSchedule : schedule) { @@ -111,13 +123,16 @@ bool 
LoopPipelinerInternal::initializeLoopInfo( stages[opSchedule.first] = opSchedule.second; opOrder.push_back(opSchedule.first); } - if (numIteration <= maxStage) + if (numIteration <= maxStage) { + LDBG("--fewer loop iterations than pipeline stages -> BAIL"); return false; + } // All operations need to have a stage. for (Operation &op : forOp.getBody()->without_terminator()) { if (!stages.contains(&op)) { op.emitOpError("not assigned a pipeline stage"); + LDBG("--op not assigned a pipeline stage: " << op << " -> BAIL"); return false; } } @@ -129,11 +144,15 @@ bool LoopPipelinerInternal::initializeLoopInfo( (void)stageNum; if (op == forOp.getBody()->getTerminator()) { op->emitError("terminator should not be assigned a stage"); + LDBG("--terminator should not be assigned stage: " << *op << " -> BAIL"); return false; } if (op->getBlock() != forOp.getBody()) { op->emitOpError("the owning Block of all operations assigned a stage " "should be the loop body block"); + LDBG("--the owning Block of all operations assigned a stage " + "should be the loop body block: " + << *op << " -> BAIL"); return false; } } @@ -145,8 +164,10 @@ bool LoopPipelinerInternal::initializeLoopInfo( [this](Value operand) { Operation *def = operand.getDefiningOp(); return !def || !stages.contains(def); - })) + })) { + LDBG("--only support loop carried dependency with a distance of 1 -> BAIL"); return false; + } annotateFn = options.annotateFn; return true; } From 6130c9df99a7a7eb9c6adc118a48f8f2acc534ab Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 30 May 2023 01:20:22 -0700 Subject: [PATCH 056/704] Use UNSUPPORTED annotation to disable Integration/Dialect/Vector/CPU/test-contraction.mlir Previous attempt to disable the test didn't seem to work as expected. --- .../Integration/Dialect/Vector/CPU/test-contraction.mlir | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir index 400f207f37348..315c99ba915f1 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir @@ -1,10 +1,9 @@ -// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts -// Disable the execution for now because of LLVM backend bug: https://github.com/llvm/llvm-project/issues/62995 -// | \ +// RUN: mlir-opt %s -convert-scf-to-cf -convert-vector-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \ // RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_c_runner_utils | \ // RUN: FileCheck %s - +// Disable the execution for now because of LLVM backend bug: https://github.com/llvm/llvm-project/issues/62995 +// UNSUPPORTED: target={{.*}} #dotp_accesses = [ affine_map<(i) -> (i)>, affine_map<(i) -> (i)>, From 898b880308f1ce31520c939ab19366dc3b82c930 Mon Sep 17 00:00:00 2001 From: Muhammad Omair Javaid Date: Tue, 30 May 2023 13:05:45 +0400 Subject: [PATCH 057/704] [LLDB] Update AArch64/Windows XFAIl decorators on TestNamespace.py --- lldb/test/API/lang/cpp/namespace/TestNamespace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/lang/cpp/namespace/TestNamespace.py b/lldb/test/API/lang/cpp/namespace/TestNamespace.py index 1dc9d00fcd993..3006699b6623a 100644 --- a/lldb/test/API/lang/cpp/namespace/TestNamespace.py +++ b/lldb/test/API/lang/cpp/namespace/TestNamespace.py @@ -11,6 +11,7 @@ class 
NamespaceBreakpointTestCase(TestBase): @expectedFailureAll(bugnumber="llvm.org/pr28548", compiler="gcc") + @expectedFailureAll(oslist=["windows"]) def test_breakpoints_func_auto(self): """Test that we can set breakpoints correctly by basename to find all functions whose basename is "func".""" self.build() @@ -37,7 +38,6 @@ def test_breakpoints_func_auto(self): ) @expectedFailureAll(bugnumber="llvm.org/pr28548", compiler="gcc") - @expectedFailureAll(oslist=["windows"]) def test_breakpoints_func_full(self): """Test that we can set breakpoints correctly by fullname to find all functions whose fully qualified name is "func" (no namespaces).""" From 9c561e8f3c2e8292bce9d7b36657144ba26a1c91 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Tue, 30 May 2023 12:03:01 +0300 Subject: [PATCH 058/704] [clang] Add test for CWG1397 Resolution of this CWG breaks potential dependency loop between complete-class context of non-static data member initializer (NSDMI), and defaulted default constructor, which is `noexcept` depending on NSDMIs among other things. For whatever reason in C++11 mode we issue an additional note and a different line number for the primary error. But I find the message itself even worse than aforementioned issues. It describes what's going on, but doesn't say what's bad about it. I find the previous version of this message more clear: https://github.com/llvm/llvm-project/commit/8dbc6b26171167b8ddf66a5f4b6d6fb9baf28336 . Created an issue for that: #62823 Reviewed By: #clang-language-wg, shafik Differential Revision: https://reviews.llvm.org/D151034 --- clang/test/CXX/drs/dr13xx.cpp | 17 +++++++++++++++++ clang/www/cxx_dr_status.html | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/clang/test/CXX/drs/dr13xx.cpp b/clang/test/CXX/drs/dr13xx.cpp index 65eb8a293276f..feaf523c44fc2 100644 --- a/clang/test/CXX/drs/dr13xx.cpp +++ b/clang/test/CXX/drs/dr13xx.cpp @@ -480,6 +480,23 @@ namespace dr1395 { // dr1395: 16 #endif } +namespace dr1397 { // dr1397: 3.2 +#if __cplusplus >= 201103L +struct A { // #dr1397-struct-A + void *p = A{}; // #dr1397-void-p +#if __cplusplus == 201103L + // expected-error@#dr1397-struct-A {{default member initializer for 'p' needed within definition of enclosing class 'A' outside of member functions}} + // expected-note@#dr1397-void-p {{in evaluation of exception specification for 'dr1397::A::A' needed here}} + // expected-note@#dr1397-void-p {{default member initializer declared here}} +#elif __cplusplus >= 201402L + // expected-error@#dr1397-void-p {{default member initializer for 'p' needed within definition of enclosing class 'A' outside of member functions}} + // expected-note@#dr1397-void-p {{default member initializer declared here}} +#endif + operator void*() const { return nullptr; } +}; +#endif +} // namespace dr1397 + namespace dr1399 { // dr1399: dup 1388 template void f(T..., int, T...) {} // expected-note {{candidate}} expected-error 0-1{{C++11}} void g() { diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 59a0b0c05295c..ec2ac24450832 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -8189,7 +8189,7 @@

C++ defect report implementation status

1397 CD4 Class completeness in non-static data member initializers - Unknown + Clang 3.2 1398 From 993060e1d31d07e9c44e7164d24f9f495197ca87 Mon Sep 17 00:00:00 2001 From: tripleCC Date: Tue, 30 May 2023 11:20:05 +0200 Subject: [PATCH 059/704] [StaticAnalyzer] Fix block pointer type nullability check This patch fixes a false negative when the property type is an objective-c block pointer. Patch By tripleCC! Differential Revision: https://reviews.llvm.org/D151651 --- .../Checkers/NullabilityChecker.cpp | 18 +++++++++++------- clang/test/Analysis/nullability.mm | 13 +++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp index da8529f4ea813..11d5e77db0c73 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp @@ -306,6 +306,10 @@ static NullConstraint getNullConstraint(DefinedOrUnknownSVal Val, return NullConstraint::Unknown; } +static bool isValidPointerType(QualType T) { + return T->isAnyPointerType() || T->isBlockPointerType(); +} + const SymbolicRegion * NullabilityChecker::getTrackRegion(SVal Val, bool CheckSuperRegion) const { if (!NeedTracking) @@ -621,7 +625,7 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S, if (!RetExpr) return; - if (!RetExpr->getType()->isAnyPointerType()) + if (!isValidPointerType(RetExpr->getType())) return; ProgramStateRef State = C.getState(); @@ -754,7 +758,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call, if (!ArgSVal) continue; - if (!Param->getType()->isAnyPointerType() && + if (!isValidPointerType(Param->getType()) && !Param->getType()->isReferenceType()) continue; @@ -841,7 +845,7 @@ void NullabilityChecker::checkPostCall(const CallEvent &Call, if (!FuncType) return; QualType ReturnType = FuncType->getReturnType(); - if (!ReturnType->isAnyPointerType()) + if (!isValidPointerType(ReturnType)) return; ProgramStateRef State = C.getState(); if (State->get()) @@ -935,7 +939,7 @@ void NullabilityChecker::checkPostObjCMessage(const ObjCMethodCall &M, if (!Decl) return; QualType RetType = Decl->getReturnType(); - if (!RetType->isAnyPointerType()) + if (!isValidPointerType(RetType)) return; ProgramStateRef State = C.getState(); @@ -1089,9 +1093,9 @@ void NullabilityChecker::checkPostStmt(const ExplicitCastExpr *CE, CheckerContext &C) const { QualType OriginType = CE->getSubExpr()->getType(); QualType DestType = CE->getType(); - if (!OriginType->isAnyPointerType()) + if (!isValidPointerType(OriginType)) return; - if (!DestType->isAnyPointerType()) + if (!isValidPointerType(DestType)) return; ProgramStateRef State = C.getState(); @@ -1215,7 +1219,7 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S, return; QualType LocType = TVR->getValueType(); - if (!LocType->isAnyPointerType()) + if (!isValidPointerType(LocType)) return; ProgramStateRef State = C.getState(); diff --git a/clang/test/Analysis/nullability.mm b/clang/test/Analysis/nullability.mm index f9b3fc60c5a02..44c241e07ee50 100644 --- a/clang/test/Analysis/nullability.mm +++ b/clang/test/Analysis/nullability.mm @@ -46,10 +46,13 @@ - (int *_Nonnull)returnsNonnull; - (int *_Nullable)returnsNullable; - (int *)returnsUnspecified; - (void)takesNonnull:(int *_Nonnull)p; +- (void)takesNonnullBlock:(void (^ _Nonnull)(void))block; - (void)takesNullable:(int *_Nullable)p; - (void)takesUnspecified:(int *)p; @property(readonly, strong) NSString *stuff; 
@property(readonly, nonnull) int *propReturnsNonnull; +@property(readonly, nonnull) void (^propReturnsNonnullBlock)(void); +@property(readonly, nullable) void (^propReturnsNullableBlock)(void); @property(readonly, nullable) int *propReturnsNullable; @property(readonly) int *propReturnsUnspecified; @end @@ -65,6 +68,7 @@ - (void)takesUnspecified:(int *)p; void takesNullable(Dummy *_Nullable); void takesNonnull(Dummy *_Nonnull); void takesUnspecified(Dummy *); +void takesNonnullBlock(void (^ _Nonnull)(void)); Dummy *_Nullable returnsNullable(); Dummy *_Nonnull returnsNonnull(); @@ -197,6 +201,7 @@ void testObjCPropertyReadNullability() { switch (getRandom()) { case 0: [o takesNonnull:o.propReturnsNonnull]; // no-warning + [o takesNonnullBlock:o.propReturnsNonnullBlock]; // no-warning break; case 1: [o takesNonnull:o.propReturnsUnspecified]; // no-warning @@ -236,6 +241,9 @@ void testObjCPropertyReadNullability() { assert(o.propReturnsNullable); [o takesNonnull:o.propReturnsNullable]; // no-warning break; + case 8: + [o takesNonnullBlock:o.propReturnsNullableBlock]; // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}} + break; } } @@ -308,6 +316,11 @@ void testIndirectNilPassToNonnull() { takesNonnull(p); // expected-warning {{Null passed to a callee that requires a non-null 1st parameter}} } +void testBlockIndirectNilPassToNonnull() { + void (^p)(void) = nil; + takesNonnullBlock(p); // expected-warning {{Null passed to a callee that requires a non-null 1st parameter}} +} + void testConditionalNilPassToNonnull(Dummy *p) { if (!p) { takesNonnull(p); // expected-warning {{Null passed to a callee that requires a non-null 1st parameter}} From 10d6562ff1bc0009024633b1fd6ab6c3abaea4b7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 30 May 2023 10:29:47 +0100 Subject: [PATCH 060/704] Fix "not all control paths return a value" MSVC warning. NFC. --- clang/lib/AST/Decl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index e441c338a2c76..99926b2786ef2 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3331,6 +3331,7 @@ bool FunctionDecl::isInlineBuiltinDeclaration() const { case GVA_StrongExternal: return true; } + llvm_unreachable("Unknown GVALinkage"); } bool FunctionDecl::isDestroyingOperatorDelete() const { From c644341c2cb71b04c4cdc9e18b2662b6e6beff64 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 30 May 2023 11:41:24 +0200 Subject: [PATCH 061/704] Revert "[mlir][bazel] Port for 660f714, third attempt" This reverts commit 421a7f814fb15dedde1b0b13a9e4ddcf7b502086. Dependency doesn't seem to be necessary and would pull in all of LLVM's codegen into mlir users that don't require it. 
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 -- 1 file changed, 2 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index f85c05d595594..9fe4bf4d27d42 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4039,7 +4039,6 @@ cc_library( ":BytecodeOpInterfaceIncGen", ":IR", ":Support", - "//llvm:CodeGen", "//llvm:Support", ], ) @@ -6836,7 +6835,6 @@ cc_library( ":BytecodeOpInterfaceIncGen", ":IR", ":Support", - "//llvm:CodeGen", "//llvm:Support", ], ) From 3ccb7702425a965836ca69fe75184698a59ee8f9 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 May 2023 11:59:09 +0200 Subject: [PATCH 062/704] [compiler-rt] Mark some performance critical buffers uninitialized With -ftrivial-auto-var-init, do not emit memset() calls for performance critical stack variables. Reviewed By: vitalybuka, dvyukov, MaskRay Differential Revision: https://reviews.llvm.org/D151551 --- compiler-rt/lib/asan/asan_stack.h | 32 +++++++++---------- compiler-rt/lib/msan/msan.h | 30 +++++++++-------- .../sanitizer_allocator_primary32.h | 2 +- .../sanitizer_internal_defs.h | 6 ++++ 4 files changed, 39 insertions(+), 31 deletions(-) diff --git a/compiler-rt/lib/asan/asan_stack.h b/compiler-rt/lib/asan/asan_stack.h index b9575d2f427ee..02a76af847ae6 100644 --- a/compiler-rt/lib/asan/asan_stack.h +++ b/compiler-rt/lib/asan/asan_stack.h @@ -32,24 +32,24 @@ u32 GetMallocContextSize(); // as early as possible (in functions exposed to the user), as we generally // don't want stack trace to contain functions from ASan internals. -#define GET_STACK_TRACE(max_size, fast) \ - BufferedStackTrace stack; \ - if (max_size <= 2) { \ - stack.size = max_size; \ - if (max_size > 0) { \ - stack.top_frame_bp = GET_CURRENT_FRAME(); \ - stack.trace_buffer[0] = StackTrace::GetCurrentPc(); \ - if (max_size > 1) stack.trace_buffer[1] = GET_CALLER_PC(); \ - } \ - } else { \ - stack.Unwind(StackTrace::GetCurrentPc(), \ - GET_CURRENT_FRAME(), nullptr, fast, max_size); \ +#define GET_STACK_TRACE(max_size, fast) \ + UNINITIALIZED BufferedStackTrace stack; \ + if (max_size <= 2) { \ + stack.size = max_size; \ + if (max_size > 0) { \ + stack.top_frame_bp = GET_CURRENT_FRAME(); \ + stack.trace_buffer[0] = StackTrace::GetCurrentPc(); \ + if (max_size > 1) \ + stack.trace_buffer[1] = GET_CALLER_PC(); \ + } \ + } else { \ + stack.Unwind(StackTrace::GetCurrentPc(), GET_CURRENT_FRAME(), nullptr, \ + fast, max_size); \ } -#define GET_STACK_TRACE_FATAL(pc, bp) \ - BufferedStackTrace stack; \ - stack.Unwind(pc, bp, nullptr, \ - common_flags()->fast_unwind_on_fatal) +#define GET_STACK_TRACE_FATAL(pc, bp) \ + UNINITIALIZED BufferedStackTrace stack; \ + stack.Unwind(pc, bp, nullptr, common_flags()->fast_unwind_on_fatal) #define GET_STACK_TRACE_FATAL_HERE \ GET_STACK_TRACE(kStackTraceMax, common_flags()->fast_unwind_on_fatal) diff --git a/compiler-rt/lib/msan/msan.h b/compiler-rt/lib/msan/msan.h index 5d8ea52668abe..50cbc5fe44d37 100644 --- a/compiler-rt/lib/msan/msan.h +++ b/compiler-rt/lib/msan/msan.h @@ -269,31 +269,33 @@ const int STACK_TRACE_TAG_POISON = StackTrace::TAG_CUSTOM + 1; const int STACK_TRACE_TAG_FIELDS = STACK_TRACE_TAG_POISON + 1; const int STACK_TRACE_TAG_VPTR = STACK_TRACE_TAG_FIELDS + 1; -#define GET_MALLOC_STACK_TRACE \ - BufferedStackTrace stack; \ - if (__msan_get_track_origins() && msan_inited) \ - stack.Unwind(StackTrace::GetCurrentPc(), GET_CURRENT_FRAME(), \ - nullptr, 
common_flags()->fast_unwind_on_malloc, \ - common_flags()->malloc_context_size) +#define GET_MALLOC_STACK_TRACE \ + UNINITIALIZED BufferedStackTrace stack; \ + if (__msan_get_track_origins() && msan_inited) { \ + stack.Unwind(StackTrace::GetCurrentPc(), GET_CURRENT_FRAME(), nullptr, \ + common_flags()->fast_unwind_on_malloc, \ + common_flags()->malloc_context_size); \ + } // For platforms which support slow unwinder only, we restrict the store context // size to 1, basically only storing the current pc. We do this because the slow // unwinder which is based on libunwind is not async signal safe and causes // random freezes in forking applications as well as in signal handlers. -#define GET_STORE_STACK_TRACE_PC_BP(pc, bp) \ - BufferedStackTrace stack; \ - if (__msan_get_track_origins() > 1 && msan_inited) { \ - int size = flags()->store_context_size; \ - if (!SANITIZER_CAN_FAST_UNWIND) \ - size = Min(size, 1); \ - stack.Unwind(pc, bp, nullptr, common_flags()->fast_unwind_on_malloc, size);\ +#define GET_STORE_STACK_TRACE_PC_BP(pc, bp) \ + UNINITIALIZED BufferedStackTrace stack; \ + if (__msan_get_track_origins() > 1 && msan_inited) { \ + int size = flags()->store_context_size; \ + if (!SANITIZER_CAN_FAST_UNWIND) \ + size = Min(size, 1); \ + stack.Unwind(pc, bp, nullptr, common_flags()->fast_unwind_on_malloc, \ + size); \ } #define GET_STORE_STACK_TRACE \ GET_STORE_STACK_TRACE_PC_BP(StackTrace::GetCurrentPc(), GET_CURRENT_FRAME()) #define GET_FATAL_STACK_TRACE_PC_BP(pc, bp) \ - BufferedStackTrace stack; \ + UNINITIALIZED BufferedStackTrace stack; \ if (msan_inited) { \ stack.Unwind(pc, bp, nullptr, common_flags()->fast_unwind_on_fatal); \ } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h index f2471efced613..52fe3fe3d15bd 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_allocator_primary32.h @@ -353,7 +353,7 @@ class SizeClassAllocator32 { DCHECK_GT(max_count, 0); TransferBatch *b = nullptr; constexpr uptr kShuffleArraySize = 48; - uptr shuffle_array[kShuffleArraySize]; + UNINITIALIZED uptr shuffle_array[kShuffleArraySize]; uptr count = 0; for (uptr i = region; i < region + n_chunks * size; i += size) { shuffle_array[count++] = i; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index ee1b3156c779e..95f4760cffd74 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -215,6 +215,7 @@ typedef u64 tid_t; # define UNLIKELY(x) (x) # define PREFETCH(x) /* _mm_prefetch(x, _MM_HINT_NTA) */ (void)0 # define WARN_UNUSED_RESULT +# define UNINITIALIZED #else // _MSC_VER # define ALWAYS_INLINE inline __attribute__((always_inline)) # define ALIAS(x) __attribute__((alias(SANITIZER_STRINGIFY(x)))) @@ -234,6 +235,11 @@ typedef u64 tid_t; # define PREFETCH(x) __builtin_prefetch(x) # endif # define WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +# if __has_attribute(uninitialized) +# define UNINITIALIZED __attribute__((uninitialized)) +# else // __has_attribute(uninitialized) +# define UNINITIALIZED +# endif // __has_attribute(uninitialized) #endif // _MSC_VER #if !defined(_MSC_VER) || defined(__clang__) From c551c9c311b33a847390f6a57afda3b82d517675 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 May 2023 11:59:16 +0200 Subject: [PATCH 063/704] 
[compiler-rt] Refactor memintrinsic interceptors This moves memintrinsic interceptors (memcpy/memmove/memset) into a new file sanitizer_common_interceptors_memintrinsics.inc. This is in preparation of redefining builtins, however, we must be careful to not redefine builtins in TUs that define interceptors of the same name. In all cases except for MSan, memintrinsic interceptors were moved to a new TU $tool_interceptors_memintrinsics.cpp. In the case of MSan, it turns out this is not yet necessary (as shown by the later patch introducing memcpy tests). NFC. Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D151552 --- compiler-rt/lib/asan/asan_interceptors.cpp | 24 -- compiler-rt/lib/asan/asan_interceptors.h | 6 + .../asan/asan_interceptors_memintrinsics.cpp | 59 +++++ .../asan/asan_interceptors_memintrinsics.h | 37 --- .../lib/hwasan/hwasan_interceptors.cpp | 1 + .../lib/memprof/memprof_interceptors.cpp | 23 -- .../lib/memprof/memprof_interceptors.h | 6 + .../memprof_interceptors_memintrinsics.cpp | 61 +++++ .../memprof_interceptors_memintrinsics.h | 39 --- compiler-rt/lib/msan/msan_interceptors.cpp | 1 + .../lib/sanitizer_common/CMakeLists.txt | 1 + .../sanitizer_common_interceptors.inc | 215 +--------------- ...izer_common_interceptors_memintrinsics.inc | 236 ++++++++++++++++++ compiler-rt/lib/tsan/rtl/CMakeLists.txt | 1 + compiler-rt/lib/tsan/rtl/tsan_interceptors.h | 26 ++ .../rtl/tsan_interceptors_memintrinsics.cpp | 41 +++ .../lib/tsan/rtl/tsan_interceptors_posix.cpp | 48 +--- compiler-rt/lib/tsan/rtl/tsan_report.cpp | 1 + .../compiler-rt/lib/tsan/rtl/BUILD.gn | 1 + 19 files changed, 451 insertions(+), 376 deletions(-) create mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc create mode 100644 compiler-rt/lib/tsan/rtl/tsan_interceptors_memintrinsics.cpp diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index ad11c822be802..7aedefe81f95f 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -87,12 +87,6 @@ using namespace __asan; DECLARE_REAL_AND_INTERCEPTOR(void *, malloc, uptr) DECLARE_REAL_AND_INTERCEPTOR(void, free, void *) -#define ASAN_INTERCEPTOR_ENTER(ctx, func) \ - AsanInterceptorContext _ctx = {#func}; \ - ctx = (void *)&_ctx; \ - (void) ctx; \ - -#define COMMON_INTERCEPT_FUNCTION(name) ASAN_INTERCEPT_FUNC(name) #define COMMON_INTERCEPT_FUNCTION_VER(name, ver) \ ASAN_INTERCEPT_FUNC_VER(name, ver) #define COMMON_INTERCEPT_FUNCTION_VER_UNVERSIONED_FALLBACK(name, ver) \ @@ -152,24 +146,6 @@ DECLARE_REAL_AND_INTERCEPTOR(void, free, void *) *begin = *end = 0; \ } -#define COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size) \ - do { \ - ASAN_INTERCEPTOR_ENTER(ctx, memmove); \ - ASAN_MEMMOVE_IMPL(ctx, to, from, size); \ - } while (false) - -#define COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size) \ - do { \ - ASAN_INTERCEPTOR_ENTER(ctx, memcpy); \ - ASAN_MEMCPY_IMPL(ctx, to, from, size); \ - } while (false) - -#define COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size) \ - do { \ - ASAN_INTERCEPTOR_ENTER(ctx, memset); \ - ASAN_MEMSET_IMPL(ctx, block, c, size); \ - } while (false) - #if CAN_SANITIZE_LEAKS #define COMMON_INTERCEPTOR_STRERROR() \ __lsan::ScopedInterceptorDisabler disabler diff --git a/compiler-rt/lib/asan/asan_interceptors.h b/compiler-rt/lib/asan/asan_interceptors.h index 93e8b8e1d545e..087189dc1f4d8 100644 --- a/compiler-rt/lib/asan/asan_interceptors.h +++ 
b/compiler-rt/lib/asan/asan_interceptors.h @@ -168,4 +168,10 @@ DECLARE_REAL(char*, strstr, const char *s1, const char *s2) #endif // !SANITIZER_FUCHSIA +#define ASAN_INTERCEPTOR_ENTER(ctx, func) \ + AsanInterceptorContext _ctx = {#func}; \ + ctx = (void *)&_ctx; \ + (void) ctx; +#define COMMON_INTERCEPT_FUNCTION(name) ASAN_INTERCEPT_FUNC(name) + #endif // ASAN_INTERCEPTORS_H diff --git a/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp b/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp index 9c316bb957493..83bb9fbcad4fd 100644 --- a/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp +++ b/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp @@ -12,12 +12,71 @@ //===---------------------------------------------------------------------===// #include "asan_interceptors_memintrinsics.h" + +#include "asan_interceptors.h" #include "asan_report.h" #include "asan_stack.h" #include "asan_suppressions.h" using namespace __asan; +// memcpy is called during __asan_init() from the internals of printf(...). +// We do not treat memcpy with to==from as a bug. +// See http://llvm.org/bugs/show_bug.cgi?id=11763. +#define ASAN_MEMCPY_IMPL(ctx, to, from, size) \ + do { \ + if (LIKELY(replace_intrin_cached)) { \ + if (LIKELY(to != from)) { \ + CHECK_RANGES_OVERLAP("memcpy", to, size, from, size); \ + } \ + ASAN_READ_RANGE(ctx, from, size); \ + ASAN_WRITE_RANGE(ctx, to, size); \ + } else if (UNLIKELY(!asan_inited)) { \ + return internal_memcpy(to, from, size); \ + } \ + return REAL(memcpy)(to, from, size); \ + } while (0) + +// memset is called inside Printf. +#define ASAN_MEMSET_IMPL(ctx, block, c, size) \ + do { \ + if (LIKELY(replace_intrin_cached)) { \ + ASAN_WRITE_RANGE(ctx, block, size); \ + } else if (UNLIKELY(!asan_inited)) { \ + return internal_memset(block, c, size); \ + } \ + return REAL(memset)(block, c, size); \ + } while (0) + +#define ASAN_MEMMOVE_IMPL(ctx, to, from, size) \ + do { \ + if (LIKELY(replace_intrin_cached)) { \ + ASAN_READ_RANGE(ctx, from, size); \ + ASAN_WRITE_RANGE(ctx, to, size); \ + } \ + return internal_memmove(to, from, size); \ + } while (0) + +#define COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size) \ + do { \ + ASAN_INTERCEPTOR_ENTER(ctx, memmove); \ + ASAN_MEMMOVE_IMPL(ctx, to, from, size); \ + } while (false) + +#define COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size) \ + do { \ + ASAN_INTERCEPTOR_ENTER(ctx, memcpy); \ + ASAN_MEMCPY_IMPL(ctx, to, from, size); \ + } while (false) + +#define COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size) \ + do { \ + ASAN_INTERCEPTOR_ENTER(ctx, memset); \ + ASAN_MEMSET_IMPL(ctx, block, c, size); \ + } while (false) + +#include "sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc" + void *__asan_memcpy(void *to, const void *from, uptr size) { ASAN_MEMCPY_IMPL(nullptr, to, from, size); } diff --git a/compiler-rt/lib/asan/asan_interceptors_memintrinsics.h b/compiler-rt/lib/asan/asan_interceptors_memintrinsics.h index bbc5390ceaa40..eb44f8f2f729b 100644 --- a/compiler-rt/lib/asan/asan_interceptors_memintrinsics.h +++ b/compiler-rt/lib/asan/asan_interceptors_memintrinsics.h @@ -79,43 +79,6 @@ struct AsanInterceptorContext { } \ } while (0) -// memcpy is called during __asan_init() from the internals of printf(...). -// We do not treat memcpy with to==from as a bug. -// See http://llvm.org/bugs/show_bug.cgi?id=11763. 
-#define ASAN_MEMCPY_IMPL(ctx, to, from, size) \ - do { \ - if (LIKELY(replace_intrin_cached)) { \ - if (LIKELY(to != from)) { \ - CHECK_RANGES_OVERLAP("memcpy", to, size, from, size); \ - } \ - ASAN_READ_RANGE(ctx, from, size); \ - ASAN_WRITE_RANGE(ctx, to, size); \ - } else if (UNLIKELY(!asan_inited)) { \ - return internal_memcpy(to, from, size); \ - } \ - return REAL(memcpy)(to, from, size); \ - } while (0) - -// memset is called inside Printf. -#define ASAN_MEMSET_IMPL(ctx, block, c, size) \ - do { \ - if (LIKELY(replace_intrin_cached)) { \ - ASAN_WRITE_RANGE(ctx, block, size); \ - } else if (UNLIKELY(!asan_inited)) { \ - return internal_memset(block, c, size); \ - } \ - return REAL(memset)(block, c, size); \ - } while (0) - -#define ASAN_MEMMOVE_IMPL(ctx, to, from, size) \ - do { \ - if (LIKELY(replace_intrin_cached)) { \ - ASAN_READ_RANGE(ctx, from, size); \ - ASAN_WRITE_RANGE(ctx, to, size); \ - } \ - return internal_memmove(to, from, size); \ - } while (0) - #define ASAN_READ_RANGE(ctx, offset, size) \ ACCESS_MEMORY_RANGE(ctx, offset, size, false) #define ASAN_WRITE_RANGE(ctx, offset, size) \ diff --git a/compiler-rt/lib/hwasan/hwasan_interceptors.cpp b/compiler-rt/lib/hwasan/hwasan_interceptors.cpp index 4eb5210e1b457..26109332a1dce 100644 --- a/compiler-rt/lib/hwasan/hwasan_interceptors.cpp +++ b/compiler-rt/lib/hwasan/hwasan_interceptors.cpp @@ -216,6 +216,7 @@ static void *mmap_interceptor(Mmap real_mmap, void *addr, SIZE_T length, return mmap_interceptor(REAL(mmap), addr, sz, prot, flags, fd, off); \ } while (false) +# include "sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc" # include "sanitizer_common/sanitizer_common_interceptors.inc" struct ThreadStartArg { diff --git a/compiler-rt/lib/memprof/memprof_interceptors.cpp b/compiler-rt/lib/memprof/memprof_interceptors.cpp index 31392a58f3dab..8925ec5bbaa37 100644 --- a/compiler-rt/lib/memprof/memprof_interceptors.cpp +++ b/compiler-rt/lib/memprof/memprof_interceptors.cpp @@ -52,11 +52,6 @@ using namespace __memprof; DECLARE_REAL_AND_INTERCEPTOR(void *, malloc, uptr) DECLARE_REAL_AND_INTERCEPTOR(void, free, void *) -#define MEMPROF_INTERCEPTOR_ENTER(ctx, func) \ - ctx = 0; \ - (void)ctx; - -#define COMMON_INTERCEPT_FUNCTION(name) MEMPROF_INTERCEPT_FUNC(name) #define COMMON_INTERCEPT_FUNCTION_VER(name, ver) \ MEMPROF_INTERCEPT_FUNC_VER(name, ver) #define COMMON_INTERCEPT_FUNCTION_VER_UNVERSIONED_FALLBACK(name, ver) \ @@ -105,24 +100,6 @@ DECLARE_REAL_AND_INTERCEPTOR(void, free, void *) *begin = *end = 0; \ } -#define COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size) \ - do { \ - MEMPROF_INTERCEPTOR_ENTER(ctx, memmove); \ - MEMPROF_MEMMOVE_IMPL(to, from, size); \ - } while (false) - -#define COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size) \ - do { \ - MEMPROF_INTERCEPTOR_ENTER(ctx, memcpy); \ - MEMPROF_MEMCPY_IMPL(to, from, size); \ - } while (false) - -#define COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size) \ - do { \ - MEMPROF_INTERCEPTOR_ENTER(ctx, memset); \ - MEMPROF_MEMSET_IMPL(block, c, size); \ - } while (false) - #include "sanitizer_common/sanitizer_common_interceptors.inc" #define COMMON_SYSCALL_PRE_READ_RANGE(p, s) MEMPROF_READ_RANGE(p, s) diff --git a/compiler-rt/lib/memprof/memprof_interceptors.h b/compiler-rt/lib/memprof/memprof_interceptors.h index 879a1e1061e5e..20edef42a5150 100644 --- a/compiler-rt/lib/memprof/memprof_interceptors.h +++ b/compiler-rt/lib/memprof/memprof_interceptors.h @@ -57,4 +57,10 @@ DECLARE_REAL(char *, strstr, const char *s1, const char *s2) ver, #name); \ } 
while (0) +#define MEMPROF_INTERCEPTOR_ENTER(ctx, func) \ + ctx = 0; \ + (void)ctx; + +#define COMMON_INTERCEPT_FUNCTION(name) MEMPROF_INTERCEPT_FUNC(name) + #endif // MEMPROF_INTERCEPTORS_H diff --git a/compiler-rt/lib/memprof/memprof_interceptors_memintrinsics.cpp b/compiler-rt/lib/memprof/memprof_interceptors_memintrinsics.cpp index 4eb409362b57f..dae2ab5dbb9bc 100644 --- a/compiler-rt/lib/memprof/memprof_interceptors_memintrinsics.cpp +++ b/compiler-rt/lib/memprof/memprof_interceptors_memintrinsics.cpp @@ -12,10 +12,71 @@ //===---------------------------------------------------------------------===// #include "memprof_interceptors_memintrinsics.h" + +#include "memprof_interceptors.h" #include "memprof_stack.h" using namespace __memprof; +// memcpy is called during __memprof_init() from the internals of printf(...). +// We do not treat memcpy with to==from as a bug. +// See http://llvm.org/bugs/show_bug.cgi?id=11763. +#define MEMPROF_MEMCPY_IMPL(to, from, size) \ + do { \ + if (UNLIKELY(!memprof_inited)) \ + return internal_memcpy(to, from, size); \ + if (memprof_init_is_running) { \ + return REAL(memcpy)(to, from, size); \ + } \ + ENSURE_MEMPROF_INITED(); \ + MEMPROF_READ_RANGE(from, size); \ + MEMPROF_WRITE_RANGE(to, size); \ + return REAL(memcpy)(to, from, size); \ + } while (0) + +// memset is called inside Printf. +#define MEMPROF_MEMSET_IMPL(block, c, size) \ + do { \ + if (UNLIKELY(!memprof_inited)) \ + return internal_memset(block, c, size); \ + if (memprof_init_is_running) { \ + return REAL(memset)(block, c, size); \ + } \ + ENSURE_MEMPROF_INITED(); \ + MEMPROF_WRITE_RANGE(block, size); \ + return REAL(memset)(block, c, size); \ + } while (0) + +#define MEMPROF_MEMMOVE_IMPL(to, from, size) \ + do { \ + if (UNLIKELY(!memprof_inited)) \ + return internal_memmove(to, from, size); \ + ENSURE_MEMPROF_INITED(); \ + MEMPROF_READ_RANGE(from, size); \ + MEMPROF_WRITE_RANGE(to, size); \ + return internal_memmove(to, from, size); \ + } while (0) + +#define COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size) \ + do { \ + MEMPROF_INTERCEPTOR_ENTER(ctx, memmove); \ + MEMPROF_MEMMOVE_IMPL(to, from, size); \ + } while (false) + +#define COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size) \ + do { \ + MEMPROF_INTERCEPTOR_ENTER(ctx, memcpy); \ + MEMPROF_MEMCPY_IMPL(to, from, size); \ + } while (false) + +#define COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size) \ + do { \ + MEMPROF_INTERCEPTOR_ENTER(ctx, memset); \ + MEMPROF_MEMSET_IMPL(block, c, size); \ + } while (false) + +#include "sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc" + void *__memprof_memcpy(void *to, const void *from, uptr size) { MEMPROF_MEMCPY_IMPL(to, from, size); } diff --git a/compiler-rt/lib/memprof/memprof_interceptors_memintrinsics.h b/compiler-rt/lib/memprof/memprof_interceptors_memintrinsics.h index 348461d55c414..0b87a6f3522af 100644 --- a/compiler-rt/lib/memprof/memprof_interceptors_memintrinsics.h +++ b/compiler-rt/lib/memprof/memprof_interceptors_memintrinsics.h @@ -32,45 +32,6 @@ namespace __memprof { __memprof_record_access_range(offset, size); \ } while (0) -// memcpy is called during __memprof_init() from the internals of printf(...). -// We do not treat memcpy with to==from as a bug. -// See http://llvm.org/bugs/show_bug.cgi?id=11763. 
-#define MEMPROF_MEMCPY_IMPL(to, from, size) \ - do { \ - if (UNLIKELY(!memprof_inited)) \ - return internal_memcpy(to, from, size); \ - if (memprof_init_is_running) { \ - return REAL(memcpy)(to, from, size); \ - } \ - ENSURE_MEMPROF_INITED(); \ - MEMPROF_READ_RANGE(from, size); \ - MEMPROF_WRITE_RANGE(to, size); \ - return REAL(memcpy)(to, from, size); \ - } while (0) - -// memset is called inside Printf. -#define MEMPROF_MEMSET_IMPL(block, c, size) \ - do { \ - if (UNLIKELY(!memprof_inited)) \ - return internal_memset(block, c, size); \ - if (memprof_init_is_running) { \ - return REAL(memset)(block, c, size); \ - } \ - ENSURE_MEMPROF_INITED(); \ - MEMPROF_WRITE_RANGE(block, size); \ - return REAL(memset)(block, c, size); \ - } while (0) - -#define MEMPROF_MEMMOVE_IMPL(to, from, size) \ - do { \ - if (UNLIKELY(!memprof_inited)) \ - return internal_memmove(to, from, size); \ - ENSURE_MEMPROF_INITED(); \ - MEMPROF_READ_RANGE(from, size); \ - MEMPROF_WRITE_RANGE(to, size); \ - return internal_memmove(to, from, size); \ - } while (0) - #define MEMPROF_READ_RANGE(offset, size) ACCESS_MEMORY_RANGE(offset, size) #define MEMPROF_WRITE_RANGE(offset, size) ACCESS_MEMORY_RANGE(offset, size) diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp index ed5e91483ae93..96abc47305cad 100644 --- a/compiler-rt/lib/msan/msan_interceptors.cpp +++ b/compiler-rt/lib/msan/msan_interceptors.cpp @@ -1421,6 +1421,7 @@ int OnExit() { } while (false) #include "sanitizer_common/sanitizer_platform_interceptors.h" +#include "sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc" #include "sanitizer_common/sanitizer_common_interceptors.inc" static uptr signal_impl(int signo, uptr cb); diff --git a/compiler-rt/lib/sanitizer_common/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/CMakeLists.txt index 614e63d5f9eb5..c4fdc7aeb4e40 100644 --- a/compiler-rt/lib/sanitizer_common/CMakeLists.txt +++ b/compiler-rt/lib/sanitizer_common/CMakeLists.txt @@ -127,6 +127,7 @@ set(SANITIZER_IMPL_HEADERS sanitizer_common_interceptors.inc sanitizer_common_interceptors_format.inc sanitizer_common_interceptors_ioctl.inc + sanitizer_common_interceptors_memintrinsics.inc sanitizer_common_interface.inc sanitizer_common_interface_posix.inc sanitizer_common_syscalls.inc diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index db090d2d8fa27..efd7c75a18209 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -26,9 +26,6 @@ // COMMON_INTERCEPTOR_SET_PTHREAD_NAME // COMMON_INTERCEPTOR_HANDLE_RECVMSG // COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED -// COMMON_INTERCEPTOR_MEMSET_IMPL -// COMMON_INTERCEPTOR_MEMMOVE_IMPL -// COMMON_INTERCEPTOR_MEMCPY_IMPL // COMMON_INTERCEPTOR_MMAP_IMPL // COMMON_INTERCEPTOR_COPY_STRING // COMMON_INTERCEPTOR_STRNDUP_IMPL @@ -198,15 +195,6 @@ extern const short *_tolower_tab_; #define wait4 __wait4_time64 #endif -// Platform-specific options. 
-#if SANITIZER_APPLE -#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 -#elif SANITIZER_WINDOWS64 -#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 -#else -#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1 -#endif // SANITIZER_APPLE - #ifndef COMMON_INTERCEPTOR_INITIALIZE_RANGE #define COMMON_INTERCEPTOR_INITIALIZE_RANGE(p, size) {} #endif @@ -302,47 +290,6 @@ extern const short *_tolower_tab_; COMMON_INTERCEPT_FUNCTION(fn) #endif -#ifndef COMMON_INTERCEPTOR_MEMSET_IMPL -#define COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, dst, v, size) \ - { \ - if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) \ - return internal_memset(dst, v, size); \ - COMMON_INTERCEPTOR_ENTER(ctx, memset, dst, v, size); \ - if (common_flags()->intercept_intrin) \ - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size); \ - return REAL(memset)(dst, v, size); \ - } -#endif - -#ifndef COMMON_INTERCEPTOR_MEMMOVE_IMPL -#define COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size) \ - { \ - if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) \ - return internal_memmove(dst, src, size); \ - COMMON_INTERCEPTOR_ENTER(ctx, memmove, dst, src, size); \ - if (common_flags()->intercept_intrin) { \ - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size); \ - COMMON_INTERCEPTOR_READ_RANGE(ctx, src, size); \ - } \ - return REAL(memmove)(dst, src, size); \ - } -#endif - -#ifndef COMMON_INTERCEPTOR_MEMCPY_IMPL -#define COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, dst, src, size) \ - { \ - if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) { \ - return internal_memmove(dst, src, size); \ - } \ - COMMON_INTERCEPTOR_ENTER(ctx, memcpy, dst, src, size); \ - if (common_flags()->intercept_intrin) { \ - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size); \ - COMMON_INTERCEPTOR_READ_RANGE(ctx, src, size); \ - } \ - return REAL(memcpy)(dst, src, size); \ - } -#endif - #ifndef COMMON_INTERCEPTOR_MMAP_IMPL #define COMMON_INTERCEPTOR_MMAP_IMPL(ctx, mmap, addr, sz, prot, flags, fd, \ off) \ @@ -841,57 +788,6 @@ INTERCEPTOR(char *, strpbrk, const char *s1, const char *s2) { #define INIT_STRPBRK #endif -#if SANITIZER_INTERCEPT_MEMSET -INTERCEPTOR(void *, memset, void *dst, int v, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, dst, v, size); -} - -#define INIT_MEMSET COMMON_INTERCEPT_FUNCTION(memset) -#else -#define INIT_MEMSET -#endif - -#if SANITIZER_INTERCEPT_MEMMOVE -INTERCEPTOR(void *, memmove, void *dst, const void *src, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size); -} - -#define INIT_MEMMOVE COMMON_INTERCEPT_FUNCTION(memmove) -#else -#define INIT_MEMMOVE -#endif - -#if SANITIZER_INTERCEPT_MEMCPY -INTERCEPTOR(void *, memcpy, void *dst, const void *src, uptr size) { - // On OS X, calling internal_memcpy here will cause memory corruptions, - // because memcpy and memmove are actually aliases of the same - // implementation. We need to use internal_memmove here. - // N.B.: If we switch this to internal_ we'll have to use internal_memmove - // due to memcpy being an alias of memmove on OS X. 
- void *ctx; -#if PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE - COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, dst, src, size); -#else - COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size); -#endif -} - -#define INIT_MEMCPY \ - do { \ - if (PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE) { \ - COMMON_INTERCEPT_FUNCTION(memcpy); \ - } else { \ - ASSIGN_REAL(memcpy, memmove); \ - } \ - CHECK(REAL(memcpy)); \ - } while (false) - -#else -#define INIT_MEMCPY -#endif - #if SANITIZER_INTERCEPT_MEMCMP DECLARE_WEAK_INTERCEPTOR_HOOK(__sanitizer_weak_hook_memcmp, uptr called_pc, const void *s1, const void *s2, uptr n, @@ -5791,105 +5687,6 @@ INTERCEPTOR(int, capset, void *hdrp, const void *datap) { #define INIT_CAPGET #endif -#if SANITIZER_INTERCEPT_AEABI_MEM -INTERCEPTOR(void *, __aeabi_memmove, void *to, const void *from, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size); -} - -INTERCEPTOR(void *, __aeabi_memmove4, void *to, const void *from, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size); -} - -INTERCEPTOR(void *, __aeabi_memmove8, void *to, const void *from, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size); -} - -INTERCEPTOR(void *, __aeabi_memcpy, void *to, const void *from, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size); -} - -INTERCEPTOR(void *, __aeabi_memcpy4, void *to, const void *from, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size); -} - -INTERCEPTOR(void *, __aeabi_memcpy8, void *to, const void *from, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size); -} - -// Note the argument order. -INTERCEPTOR(void *, __aeabi_memset, void *block, uptr size, int c) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size); -} - -INTERCEPTOR(void *, __aeabi_memset4, void *block, uptr size, int c) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size); -} - -INTERCEPTOR(void *, __aeabi_memset8, void *block, uptr size, int c) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size); -} - -INTERCEPTOR(void *, __aeabi_memclr, void *block, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); -} - -INTERCEPTOR(void *, __aeabi_memclr4, void *block, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); -} - -INTERCEPTOR(void *, __aeabi_memclr8, void *block, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); -} - -#define INIT_AEABI_MEM \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memmove); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memmove4); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memmove8); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memcpy); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memcpy4); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memcpy8); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memset); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memset4); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memset8); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memclr); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memclr4); \ - COMMON_INTERCEPT_FUNCTION(__aeabi_memclr8); -#else -#define INIT_AEABI_MEM -#endif // SANITIZER_INTERCEPT_AEABI_MEM - -#if SANITIZER_INTERCEPT___BZERO -INTERCEPTOR(void *, __bzero, void *block, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); -} -#define INIT___BZERO COMMON_INTERCEPT_FUNCTION(__bzero); -#else -#define INIT___BZERO -#endif // SANITIZER_INTERCEPT___BZERO - -#if SANITIZER_INTERCEPT_BZERO -INTERCEPTOR(void *, bzero, void 
*block, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); -} -#define INIT_BZERO COMMON_INTERCEPT_FUNCTION(bzero); -#else -#define INIT_BZERO -#endif // SANITIZER_INTERCEPT_BZERO - #if SANITIZER_INTERCEPT_FTIME INTERCEPTOR(int, ftime, __sanitizer_timeb *tp) { void *ctx; @@ -10362,12 +10159,18 @@ INTERCEPTOR(int, argp_parse, const struct argp *argp, int argc, char **argv, #include "sanitizer_common_interceptors_netbsd_compat.inc" +namespace __sanitizer { +void InitializeMemintrinsicInterceptors(); +} // namespace __sanitizer + static void InitializeCommonInterceptors() { #if SI_POSIX static u64 metadata_mem[sizeof(MetadataHashMap) / sizeof(u64) + 1]; interceptor_metadata_map = new ((void *)&metadata_mem) MetadataHashMap(); #endif + __sanitizer::InitializeMemintrinsicInterceptors(); + INIT_MMAP; INIT_MMAP64; INIT_TEXTDOMAIN; @@ -10389,9 +10192,6 @@ static void InitializeCommonInterceptors() { INIT_STRPBRK; INIT_STRXFRM; INIT___STRXFRM_L; - INIT_MEMSET; - INIT_MEMMOVE; - INIT_MEMCPY; INIT_MEMCHR; INIT_MEMCMP; INIT_BCMP; @@ -10563,9 +10363,6 @@ static void InitializeCommonInterceptors() { INIT_GETIFADDRS; INIT_IF_INDEXTONAME; INIT_CAPGET; - INIT_AEABI_MEM; - INIT___BZERO; - INIT_BZERO; INIT_FTIME; INIT_XDR; INIT_XDRREC_LINUX; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc new file mode 100644 index 0000000000000..e6b967c48b2e1 --- /dev/null +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc @@ -0,0 +1,236 @@ +//===-- sanitizer_common_interceptors_memintrinsics.inc ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Memintrinsic function interceptors for tools like AddressSanitizer, +// ThreadSanitizer, MemorySanitizer, etc. +// +// This file should be included into the tool's memintrinsic interceptor file, +// which has to define its own macros: +// COMMON_INTERCEPTOR_ENTER +// COMMON_INTERCEPTOR_READ_RANGE +// COMMON_INTERCEPTOR_WRITE_RANGE +// COMMON_INTERCEPTOR_MEMSET_IMPL +// COMMON_INTERCEPTOR_MEMMOVE_IMPL +// COMMON_INTERCEPTOR_MEMCPY_IMPL +// COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED +//===----------------------------------------------------------------------===// + +#include "interception/interception.h" +#include "sanitizer_platform_interceptors.h" + +// Platform-specific options. 
+#if SANITIZER_APPLE +#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 +#elif SANITIZER_WINDOWS64 +#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 +#else +#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1 +#endif // SANITIZER_APPLE + +#ifndef COMMON_INTERCEPTOR_MEMSET_IMPL +#define COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, dst, v, size) \ + { \ + if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) \ + return internal_memset(dst, v, size); \ + COMMON_INTERCEPTOR_ENTER(ctx, memset, dst, v, size); \ + if (common_flags()->intercept_intrin) \ + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size); \ + return REAL(memset)(dst, v, size); \ + } +#endif + +#ifndef COMMON_INTERCEPTOR_MEMMOVE_IMPL +#define COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size) \ + { \ + if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) \ + return internal_memmove(dst, src, size); \ + COMMON_INTERCEPTOR_ENTER(ctx, memmove, dst, src, size); \ + if (common_flags()->intercept_intrin) { \ + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size); \ + COMMON_INTERCEPTOR_READ_RANGE(ctx, src, size); \ + } \ + return REAL(memmove)(dst, src, size); \ + } +#endif + +#ifndef COMMON_INTERCEPTOR_MEMCPY_IMPL +#define COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, dst, src, size) \ + { \ + if (COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED) { \ + return internal_memmove(dst, src, size); \ + } \ + COMMON_INTERCEPTOR_ENTER(ctx, memcpy, dst, src, size); \ + if (common_flags()->intercept_intrin) { \ + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, dst, size); \ + COMMON_INTERCEPTOR_READ_RANGE(ctx, src, size); \ + } \ + return REAL(memcpy)(dst, src, size); \ + } +#endif + +#if SANITIZER_INTERCEPT_MEMSET +INTERCEPTOR(void *, memset, void *dst, int v, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, dst, v, size); +} + +#define INIT_MEMSET COMMON_INTERCEPT_FUNCTION(memset) +#else +#define INIT_MEMSET +#endif + +#if SANITIZER_INTERCEPT_MEMMOVE +INTERCEPTOR(void *, memmove, void *dst, const void *src, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size); +} + +#define INIT_MEMMOVE COMMON_INTERCEPT_FUNCTION(memmove) +#else +#define INIT_MEMMOVE +#endif + +#if SANITIZER_INTERCEPT_MEMCPY +INTERCEPTOR(void *, memcpy, void *dst, const void *src, uptr size) { + // On OS X, calling internal_memcpy here will cause memory corruptions, + // because memcpy and memmove are actually aliases of the same + // implementation. We need to use internal_memmove here. + // N.B.: If we switch this to internal_ we'll have to use internal_memmove + // due to memcpy being an alias of memmove on OS X. 
+ void *ctx; +#if PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE + COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, dst, src, size); +#else + COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size); +#endif +} + +#define INIT_MEMCPY \ + do { \ + if (PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE) { \ + COMMON_INTERCEPT_FUNCTION(memcpy); \ + } else { \ + ASSIGN_REAL(memcpy, memmove); \ + } \ + CHECK(REAL(memcpy)); \ + } while (false) + +#else +#define INIT_MEMCPY +#endif + +#if SANITIZER_INTERCEPT_AEABI_MEM +INTERCEPTOR(void *, __aeabi_memmove, void *to, const void *from, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size); +} + +INTERCEPTOR(void *, __aeabi_memmove4, void *to, const void *from, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size); +} + +INTERCEPTOR(void *, __aeabi_memmove8, void *to, const void *from, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size); +} + +INTERCEPTOR(void *, __aeabi_memcpy, void *to, const void *from, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size); +} + +INTERCEPTOR(void *, __aeabi_memcpy4, void *to, const void *from, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size); +} + +INTERCEPTOR(void *, __aeabi_memcpy8, void *to, const void *from, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size); +} + +// Note the argument order. +INTERCEPTOR(void *, __aeabi_memset, void *block, uptr size, int c) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size); +} + +INTERCEPTOR(void *, __aeabi_memset4, void *block, uptr size, int c) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size); +} + +INTERCEPTOR(void *, __aeabi_memset8, void *block, uptr size, int c) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size); +} + +INTERCEPTOR(void *, __aeabi_memclr, void *block, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); +} + +INTERCEPTOR(void *, __aeabi_memclr4, void *block, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); +} + +INTERCEPTOR(void *, __aeabi_memclr8, void *block, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); +} + +#define INIT_AEABI_MEM \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memmove); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memmove4); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memmove8); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memcpy); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memcpy4); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memcpy8); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memset); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memset4); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memset8); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memclr); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memclr4); \ + COMMON_INTERCEPT_FUNCTION(__aeabi_memclr8); +#else +#define INIT_AEABI_MEM +#endif // SANITIZER_INTERCEPT_AEABI_MEM + +#if SANITIZER_INTERCEPT___BZERO +INTERCEPTOR(void *, __bzero, void *block, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); +} +#define INIT___BZERO COMMON_INTERCEPT_FUNCTION(__bzero); +#else +#define INIT___BZERO +#endif // SANITIZER_INTERCEPT___BZERO + +#if SANITIZER_INTERCEPT_BZERO +INTERCEPTOR(void *, bzero, void *block, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, 0, size); +} +#define INIT_BZERO COMMON_INTERCEPT_FUNCTION(bzero); +#else +#define INIT_BZERO +#endif // SANITIZER_INTERCEPT_BZERO + +namespace __sanitizer { +// This does not 
need to be called if InitializeCommonInterceptors() is called. +void InitializeMemintrinsicInterceptors() { + INIT_MEMSET; + INIT_MEMMOVE; + INIT_MEMCPY; + INIT_AEABI_MEM; + INIT___BZERO; + INIT_BZERO; +} +} // namespace __sanitizer diff --git a/compiler-rt/lib/tsan/rtl/CMakeLists.txt b/compiler-rt/lib/tsan/rtl/CMakeLists.txt index 7ad91b3cddd18..c9e19e0f16c58 100644 --- a/compiler-rt/lib/tsan/rtl/CMakeLists.txt +++ b/compiler-rt/lib/tsan/rtl/CMakeLists.txt @@ -27,6 +27,7 @@ set(TSAN_SOURCES tsan_fd.cpp tsan_flags.cpp tsan_ignoreset.cpp + tsan_interceptors_memintrinsics.cpp tsan_interceptors_posix.cpp tsan_interface.cpp tsan_interface_ann.cpp diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors.h b/compiler-rt/lib/tsan/rtl/tsan_interceptors.h index 64a7fe2a9997b..a357a870fdf8e 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors.h +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors.h @@ -29,6 +29,11 @@ class ScopedInterceptor { void EnableIgnoresImpl(); }; +struct TsanInterceptorContext { + ThreadState *thr; + const uptr pc; +}; + LibIgnore *libignore(); #if !SANITIZER_GO @@ -103,4 +108,25 @@ inline bool MustIgnoreInterceptor(ThreadState *thr) { # define TSAN_INTERCEPTOR_NETBSD_ALIAS_THR2(ret, func, func2, ...) #endif +#define COMMON_INTERCEPT_FUNCTION(name) INTERCEPT_FUNCTION(name) + +#define COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED \ + (!cur_thread_init()->is_inited) + +#define COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, size) \ + MemoryAccessRange(((TsanInterceptorContext *)ctx)->thr, \ + ((TsanInterceptorContext *)ctx)->pc, (uptr)ptr, size, \ + true) + +#define COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, size) \ + MemoryAccessRange(((TsanInterceptorContext *) ctx)->thr, \ + ((TsanInterceptorContext *) ctx)->pc, (uptr) ptr, size, \ + false) + +#define COMMON_INTERCEPTOR_ENTER(ctx, func, ...) \ + SCOPED_TSAN_INTERCEPTOR(func, __VA_ARGS__); \ + TsanInterceptorContext _ctx = {thr, pc}; \ + ctx = (void *)&_ctx; \ + (void)ctx; + #endif // TSAN_INTERCEPTORS_H diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_memintrinsics.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_memintrinsics.cpp new file mode 100644 index 0000000000000..6a2a4298c217a --- /dev/null +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_memintrinsics.cpp @@ -0,0 +1,41 @@ +//===-- tsan_interceptors_posix.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of ThreadSanitizer (TSan), a race detector. 
+// +//===----------------------------------------------------------------------===// + +#include "tsan_interceptors.h" +#include "tsan_interface.h" + +using namespace __tsan; + +#include "sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc" + +extern "C" { + +void *__tsan_memcpy(void *dst, const void *src, uptr size) { + void *ctx; +#if PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE + COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, dst, src, size); +#else + COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size); +#endif +} + +void *__tsan_memset(void *dst, int c, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, dst, c, size); +} + +void *__tsan_memmove(void *dst, const void *src, uptr size) { + void *ctx; + COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size); +} + +} // extern "C" diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp index caf9a46db625c..1e4e5bda81e9b 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp @@ -35,6 +35,9 @@ using namespace __tsan; +DECLARE_REAL(void *, memcpy, void *to, const void *from, SIZE_T size) +DECLARE_REAL(void *, memset, void *block, int c, SIZE_T size) + #if SANITIZER_FREEBSD || SANITIZER_APPLE #define stdout __stdoutp #define stderr __stderrp @@ -158,9 +161,6 @@ const int SA_SIGINFO = 4; const int SIG_SETMASK = 2; #endif -#define COMMON_INTERCEPTOR_NOTHING_IS_INITIALIZED \ - (!cur_thread_init()->is_inited) - namespace __tsan { struct SignalDesc { bool armed; @@ -2391,11 +2391,6 @@ static int OnExit(ThreadState *thr) { return status; } -struct TsanInterceptorContext { - ThreadState *thr; - const uptr pc; -}; - #if !SANITIZER_APPLE static void HandleRecvmsg(ThreadState *thr, uptr pc, __sanitizer_msghdr *msg) { @@ -2417,28 +2412,11 @@ static void HandleRecvmsg(ThreadState *thr, uptr pc, #define SANITIZER_INTERCEPT_TLS_GET_OFFSET 1 #undef SANITIZER_INTERCEPT_PTHREAD_SIGMASK -#define COMMON_INTERCEPT_FUNCTION(name) INTERCEPT_FUNCTION(name) #define COMMON_INTERCEPT_FUNCTION_VER(name, ver) \ INTERCEPT_FUNCTION_VER(name, ver) #define COMMON_INTERCEPT_FUNCTION_VER_UNVERSIONED_FALLBACK(name, ver) \ (INTERCEPT_FUNCTION_VER(name, ver) || INTERCEPT_FUNCTION(name)) -#define COMMON_INTERCEPTOR_WRITE_RANGE(ctx, ptr, size) \ - MemoryAccessRange(((TsanInterceptorContext *)ctx)->thr, \ - ((TsanInterceptorContext *)ctx)->pc, (uptr)ptr, size, \ - true) - -#define COMMON_INTERCEPTOR_READ_RANGE(ctx, ptr, size) \ - MemoryAccessRange(((TsanInterceptorContext *) ctx)->thr, \ - ((TsanInterceptorContext *) ctx)->pc, (uptr) ptr, size, \ - false) - -#define COMMON_INTERCEPTOR_ENTER(ctx, func, ...) \ - SCOPED_TSAN_INTERCEPTOR(func, __VA_ARGS__); \ - TsanInterceptorContext _ctx = {thr, pc}; \ - ctx = (void *)&_ctx; \ - (void)ctx; - #define COMMON_INTERCEPTOR_ENTER_NOIGNORE(ctx, func, ...) 
\ SCOPED_INTERCEPTOR_RAW(func, __VA_ARGS__); \ TsanInterceptorContext _ctx = {thr, pc}; \ @@ -3131,22 +3109,4 @@ SANITIZER_INTERFACE_ATTRIBUTE void __tsan_testonly_barrier_wait( } } -void *__tsan_memcpy(void *dst, const void *src, uptr size) { - void *ctx; -#if PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE - COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, dst, src, size); -#else - COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size); -#endif -} - -void *__tsan_memset(void *dst, int c, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, dst, c, size); -} - -void *__tsan_memmove(void *dst, const void *src, uptr size) { - void *ctx; - COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, dst, src, size); -} -} +} // extern "C" diff --git a/compiler-rt/lib/tsan/rtl/tsan_report.cpp b/compiler-rt/lib/tsan/rtl/tsan_report.cpp index 7c8d1253a3ec8..3ae666e1212f7 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_report.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_report.cpp @@ -278,6 +278,7 @@ static bool FrameIsInternal(const SymbolizedStack *frame) { const char *module = frame->info.module; if (file != 0 && (internal_strstr(file, "tsan_interceptors_posix.cpp") || + internal_strstr(file, "tsan_interceptors_memintrinsics.cpp") || internal_strstr(file, "sanitizer_common_interceptors.inc") || internal_strstr(file, "tsan_interface_"))) return true; diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/tsan/rtl/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/tsan/rtl/BUILD.gn index 429afd835d28c..3c1e1fa6ef030 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/tsan/rtl/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/tsan/rtl/BUILD.gn @@ -59,6 +59,7 @@ target(tsan_target_type, "rtl") { "tsan_ignoreset.h", "tsan_ilist.h", "tsan_interceptors.h", + "tsan_interceptors_memintrinsics.cpp", "tsan_interceptors_posix.cpp", "tsan_interface.cpp", "tsan_interface.h", From c4efcd6970e22e523e9f0088614dbcade05491bc Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Tue, 30 May 2023 10:59:15 +0100 Subject: [PATCH 064/704] [RISCV] Generalise shouldExtendTypeInLibcall logic to apply to all Date: Mon, 22 May 2023 16:09:51 +0200 Subject: [PATCH 065/704] [InstCombine] Remove computeKnownBits() fold for returns We try to fold constant computeKnownBits() with context for return instructions only. Otherwise, we rely on SimplifyDemandedBits() to fold instructions with constant known bits. The presence of this special fold for returns is dangerous, because it makes our tests lie about what works and what doesn't. Tests are usually written by returning the result we're interested in, but will go through this separate code path that is not used for anything else. This patch removes the special fold. This primarily regresses patterns of the style "assume(x); return x". The responsibility of handling such patterns lies with passes like EarlyCSE/GVN anyway, which will do this reliably, and not just for returns. 
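For illustration, a minimal C++ sketch of the affected pattern (the function name
is hypothetical), assuming __builtin_assume is lowered to an llvm.assume intrinsic
as in the tests updated below:

  int returns_assumed_value(int a) {
    __builtin_assume(a == 4);
    // Before this change, InstCombine folded the returned value to 4 via
    // computeKnownBits() with the return as context; after it, that
    // simplification is left to passes such as EarlyCSE/GVN.
    return a;
  }
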
Differential Revision: https://reviews.llvm.org/D151099 --- .../InstCombine/InstructionCombining.cpp | 20 +------------------ llvm/test/Transforms/InstCombine/assume.ll | 18 ++++++++--------- .../Transforms/InstCombine/known-phi-br.ll | 6 ++++-- .../Transforms/InstCombine/zext-or-icmp.ll | 2 +- 4 files changed, 14 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 2af6ba5c81822..682005282d92a 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2471,25 +2471,7 @@ static bool isMustTailCall(Value *V) { } Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) { - if (RI.getNumOperands() == 0) // ret void - return nullptr; - - Value *ResultOp = RI.getOperand(0); - Type *VTy = ResultOp->getType(); - if (!VTy->isIntegerTy() || isa(ResultOp)) - return nullptr; - - // Don't replace result of musttail calls. - if (isMustTailCall(ResultOp)) - return nullptr; - - // There might be assume intrinsics dominating this return that completely - // determine the value. If so, constant fold it. - KnownBits Known = computeKnownBits(ResultOp, 0, &RI); - if (Known.isConstant()) - return replaceOperand(RI, 0, - Constant::getIntegerValue(VTy, Known.getConstant())); - + // Nothing for now. return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 467ecec60a3f5..83ff0e3a392dd 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S -instcombine-infinite-loop-threshold=2 | FileCheck --check-prefixes=CHECK,DEFAULT %s -; RUN: opt < %s -passes=instcombine --enable-knowledge-retention -S -instcombine-infinite-loop-threshold=2 | FileCheck --check-prefixes=CHECK,BUNDLES %s +; RUN: opt < %s -passes=instcombine -S -instcombine-infinite-loop-threshold=3 | FileCheck --check-prefixes=CHECK,DEFAULT %s +; RUN: opt < %s -passes=instcombine --enable-knowledge-retention -S -instcombine-infinite-loop-threshold=3 | FileCheck --check-prefixes=CHECK,BUNDLES %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -60,7 +60,7 @@ define i32 @simple(i32 %a) #1 { ; CHECK-LABEL: @simple( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 4 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: ret i32 4 +; CHECK-NEXT: ret i32 [[A]] ; %cmp = icmp eq i32 %a, 4 tail call void @llvm.assume(i1 %cmp) @@ -204,7 +204,8 @@ define i32 @icmp1(i32 %a) #0 { ; CHECK-LABEL: @icmp1( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) -; CHECK-NEXT: ret i32 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] ; %cmp = icmp sgt i32 %a, 5 tail call void @llvm.assume(i1 %cmp) @@ -231,7 +232,7 @@ define i1 @assume_not(i1 %cond) { ; CHECK-LABEL: @assume_not( ; CHECK-NEXT: [[NOTCOND:%.*]] = xor i1 [[COND:%.*]], true ; CHECK-NEXT: call void @llvm.assume(i1 [[NOTCOND]]) -; CHECK-NEXT: ret i1 false +; CHECK-NEXT: ret i1 [[COND]] ; %notcond = xor i1 %cond, true call void @llvm.assume(i1 %notcond) @@ -382,10 +383,7 @@ define i1 @nonnull5(ptr %a) { define i32 @assumption_conflicts_with_known_bits(i32 %a, i32 %b) { ; CHECK-LABEL: @assumption_conflicts_with_known_bits( -; 
CHECK-NEXT: [[AND1:%.*]] = and i32 [[B:%.*]], 3 ; CHECK-NEXT: tail call void @llvm.assume(i1 false) -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[AND1]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]]) ; CHECK-NEXT: ret i32 0 ; %and1 = and i32 %b, 3 @@ -451,7 +449,7 @@ define i1 @nonnull3A(ptr %a, i1 %control) { ; DEFAULT: taken: ; DEFAULT-NEXT: [[CMP:%.*]] = icmp ne ptr [[LOAD]], null ; DEFAULT-NEXT: call void @llvm.assume(i1 [[CMP]]) -; DEFAULT-NEXT: ret i1 true +; DEFAULT-NEXT: ret i1 [[CMP]] ; DEFAULT: not_taken: ; DEFAULT-NEXT: [[RVAL_2:%.*]] = icmp sgt ptr [[LOAD]], null ; DEFAULT-NEXT: ret i1 [[RVAL_2]] @@ -487,7 +485,7 @@ define i1 @nonnull3B(ptr %a, i1 %control) { ; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[LOAD]], null ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) [ "nonnull"(ptr [[LOAD]]) ] -; CHECK-NEXT: ret i1 true +; CHECK-NEXT: ret i1 [[CMP]] ; CHECK: not_taken: ; CHECK-NEXT: ret i1 [[CONTROL]] ; diff --git a/llvm/test/Transforms/InstCombine/known-phi-br.ll b/llvm/test/Transforms/InstCombine/known-phi-br.ll index 64d3344eb2066..1ad0ed42d8d34 100644 --- a/llvm/test/Transforms/InstCombine/known-phi-br.ll +++ b/llvm/test/Transforms/InstCombine/known-phi-br.ll @@ -15,7 +15,8 @@ define i64 @limit_i64_eq_7(i64 %x) { ; CHECK: body: ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: ret i64 7 +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ 7, [[BODY]] ] +; CHECK-NEXT: ret i64 [[RES]] ; entry: %cmp = icmp eq i64 %x, 7 @@ -37,7 +38,8 @@ define i64 @limit_i64_ne_255(i64 %x) { ; CHECK: body: ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: ret i64 255 +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[X]], [[ENTRY:%.*]] ], [ 255, [[BODY]] ] +; CHECK-NEXT: ret i64 [[RES]] ; entry: %cmp = icmp ne i64 %x, 255 diff --git a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll index 655c2e7c083a9..7d57cd21f4e83 100644 --- a/llvm/test/Transforms/InstCombine/zext-or-icmp.ll +++ b/llvm/test/Transforms/InstCombine/zext-or-icmp.ll @@ -251,7 +251,7 @@ define i1 @PR51762(ptr %i, i32 %t0, i16 %t1, ptr %p, ptr %d, ptr %f, i32 %p2, i1 ; CHECK-NEXT: store i32 [[SROA38]], ptr [[D]], align 8 ; CHECK-NEXT: [[R:%.*]] = icmp ult i64 [[INSERT_INSERT41]], [[CONV19]] ; CHECK-NEXT: call void @llvm.assume(i1 [[R]]) -; CHECK-NEXT: ret i1 true +; CHECK-NEXT: ret i1 [[R]] ; entry: br label %for.cond From d0a4dcf52f655681b42c023002ac5b42d7275c25 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Tue, 30 May 2023 18:30:44 +0800 Subject: [PATCH 066/704] [InstCombine] Remove unused function 'isMustTailCall' (NFC) /data/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp:2467:13: error: unused function 'isMustTailCall' [-Werror,-Wunused-function] static bool isMustTailCall(Value *V) { ^ 1 error generated. 
--- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 682005282d92a..80abfc900f659 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2464,12 +2464,6 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI, Value *Op) { return nullptr; } -static bool isMustTailCall(Value *V) { - if (auto *CI = dyn_cast(V)) - return CI->isMustTailCall(); - return false; -} - Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) { // Nothing for now. return nullptr; From 95661b9c7545b56e6ec5a0cfec75587f37a7ca50 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 30 May 2023 11:38:20 +0100 Subject: [PATCH 067/704] [X86] getTargetConstantBitsFromNode - support extracting fp data from ConstantDataSequential Fixes issue introduced by 0f8e0f4228805cbecce13dcfadef4c48a4f0f4cd where SimplifyDemandedBits could crash when trying to extract fp data from broadcasted constants --- llvm/lib/Target/X86/X86ISelLowering.cpp | 14 +++++++-- llvm/test/CodeGen/X86/avx-vbroadcast.ll | 42 +++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9b433280d5d9d..a89ab94c9e0d7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7453,9 +7453,19 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, if (auto *CDS = dyn_cast(Cst)) { Type *Ty = CDS->getType(); Mask = APInt::getZero(Ty->getPrimitiveSizeInBits()); - unsigned EltBits = CDS->getElementType()->getPrimitiveSizeInBits(); + Type *EltTy = CDS->getElementType(); + bool IsInteger = EltTy->isIntegerTy(); + bool IsFP = + EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy(); + if (!IsInteger && !IsFP) + return false; + unsigned EltBits = EltTy->getPrimitiveSizeInBits(); for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) - Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits); + if (IsInteger) + Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits); + else + Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(), + I * EltBits); return true; } return false; diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index 54bce767f1fcc..b442a6337e3b8 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -914,6 +914,48 @@ define double @broadcast_scale_xyz(ptr nocapture readonly, ptr nocapture readonl ret double %14 } +; +; Broadcast v2f32 non-uniform constant via vmovddup +; +define void @fmul_by_v2f32_broadcast() nounwind { +; X86-LABEL: fmul_by_v2f32_broadcast: +; X86: ## %bb.0: +; X86-NEXT: vmovddup {{.*#+}} xmm0 = [3.1E+1,0.0E+0,3.1E+1,0.0E+0] +; X86-NEXT: ## xmm0 = mem[0,0] +; X86-NEXT: ## implicit-def: $xmm1 +; X86-NEXT: .p2align 4, 0x90 +; X86-NEXT: LBB42_1: ## =>This Inner Loop Header: Depth=1 +; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X86-NEXT: vmulps %xmm0, %xmm2, %xmm2 +; X86-NEXT: vmovlps %xmm2, (%eax) +; X86-NEXT: vmulps %xmm0, %xmm1, %xmm1 +; X86-NEXT: vmovlps %xmm1, (%eax) +; X86-NEXT: jmp LBB42_1 +; +; X64-LABEL: fmul_by_v2f32_broadcast: +; X64: ## %bb.0: +; X64-NEXT: vmovddup {{.*#+}} xmm0 = [3.1E+1,0.0E+0,3.1E+1,0.0E+0] +; X64-NEXT: ## xmm0 = mem[0,0] +; X64-NEXT: ## 
implicit-def: $xmm1 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: LBB42_1: ## =>This Inner Loop Header: Depth=1 +; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X64-NEXT: vmulps %xmm0, %xmm2, %xmm2 +; X64-NEXT: vmovlps %xmm2, (%rax) +; X64-NEXT: vmulps %xmm0, %xmm1, %xmm1 +; X64-NEXT: vmovlps %xmm1, (%rax) +; X64-NEXT: jmp LBB42_1 + br label %1 +1: + %2 = phi <2 x float> [ undef, %0 ], [ %5, %1 ] + %3 = load <2 x float>, ptr poison, align 8 + %4 = fmul <2 x float> %3, + store <2 x float> %4, ptr poison, align 8 + %5 = fmul <2 x float> %2, + store <2 x float> %5, ptr poison, align 8 + br label %1 +} + ; ; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies. ; From 686b4d250b13bbda32c335d104fcb79874774104 Mon Sep 17 00:00:00 2001 From: Pierre Gousseau Date: Tue, 30 May 2023 10:12:54 +0000 Subject: [PATCH 068/704] Reland 2nd attempt: [tsan] Add debugging interfaces into interface header. Change __tsan_get_report_loc 6th argument 'size' to unsigned long * Reviewers: vitalybuka, dvyukov Differential Revision: https://reviews.llvm.org/D148214 --- .../include/sanitizer/tsan_interface.h | 117 ++++++++++++++++++ .../test/tsan/debug_mutex_bad_unlock.cpp | 77 ++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 compiler-rt/test/tsan/debug_mutex_bad_unlock.cpp diff --git a/compiler-rt/include/sanitizer/tsan_interface.h b/compiler-rt/include/sanitizer/tsan_interface.h index 58f2513734ecb..f19c79d79ba62 100644 --- a/compiler-rt/include/sanitizer/tsan_interface.h +++ b/compiler-rt/include/sanitizer/tsan_interface.h @@ -178,6 +178,123 @@ const char* __tsan_default_options(void); // User-provided default TSAN suppressions. const char* __tsan_default_suppressions(void); +/// Returns a report's description. +/// +/// Returns a report's description (issue type), number of duplicate issues +/// found, counts of array data (stack traces, memory operations, locations, +/// mutexes, threads, unique thread IDs) and a stack trace of a sleep() +/// call (if one was involved in the issue). +/// +/// \param report Opaque pointer to the current report. +/// \param[out] description Report type description. +/// \param[out] count Count of duplicate issues. +/// \param[out] stack_count Count of stack traces. +/// \param[out] mop_count Count of memory operations. +/// \param[out] loc_count Count of locations. +/// \param[out] mutex_count Count of mutexes. +/// \param[out] thread_count Count of threads. +/// \param[out] unique_tid_count Count of unique thread IDs. +/// \param sleep_trace A buffer to store the stack trace of a sleep() +/// call. +/// \param trace_size Size in bytes of the trace buffer. +/// \returns Returns 1 if successful, 0 if not. +int __tsan_get_report_data(void *report, const char **description, int *count, + int *stack_count, int *mop_count, int *loc_count, + int *mutex_count, int *thread_count, + int *unique_tid_count, void **sleep_trace, + unsigned long trace_size); + +/// Returns information about stack traces included in the report. +/// +/// \param report Opaque pointer to the current report. +/// \param idx Index to the report's stacks. +/// \param trace A buffer to store the stack trace. +/// \param trace_size Size in bytes of the trace buffer. +/// \returns Returns 1 if successful, 0 if not. +int __tsan_get_report_stack(void *report, unsigned long idx, void **trace, + unsigned long trace_size); + +/// Returns information about memory operations included in the report. +/// +/// \param report Opaque pointer to the current report. 
+/// \param idx Index to the report's memory operations. +/// \param[out] tid Thread ID of the memory operation. +/// \param[out] addr Address of the memory operation. +/// \param[out] size Size of the memory operation. +/// \param[out] write Write flag of the memory operation. +/// \param[out] atomic Atomicity flag of the memory operation. +/// \param trace A buffer to store the stack trace. +/// \param trace_size Size in bytes of the trace buffer. +/// \returns Returns 1 if successful, 0 if not. +int __tsan_get_report_mop(void *report, unsigned long idx, int *tid, + void **addr, int *size, int *write, int *atomic, + void **trace, unsigned long trace_size); + +/// Returns information about locations included in the report. +/// +/// \param report Opaque pointer to the current report. +/// \param idx Index to the report's locations. +/// \param[out] type Type of the location. +/// \param[out] addr Address of the location. +/// \param[out] start Start of the location. +/// \param[out] size Size of the location. +/// \param[out] tid Thread ID of the location. +/// \param[out] fd File descriptor of the location. +/// \param[out] suppressable Suppressable flag. +/// \param trace A buffer to store the stack trace. +/// \param trace_size Size in bytes of the trace buffer. +/// \returns Returns 1 if successful, 0 if not. +int __tsan_get_report_loc(void *report, unsigned long idx, const char **type, + void **addr, void **start, unsigned long *size, + int *tid, int *fd, int *suppressable, void **trace, + unsigned long trace_size); + +/// Returns information about mutexes included in the report. +/// +/// \param report Opaque pointer to the current report. +/// \param idx Index to the report's mutexes. +/// \param[out] mutex_id Id of the mutex. +/// \param[out] addr Address of the mutex. +/// \param[out] destroyed Destroyed mutex flag. +/// \param trace A buffer to store the stack trace. +/// \param trace_size Size in bytes of the trace buffer. +/// \returns Returns 1 if successful, 0 if not. +int __tsan_get_report_mutex(void *report, unsigned long idx, uint64_t *mutex_id, + void **addr, int *destroyed, void **trace, + unsigned long trace_size); + +/// Returns information about threads included in the report. +/// +/// \param report Opaque pointer to the current report. +/// \param idx Index to the report's threads. +/// \param[out] tid Thread ID of the thread. +/// \param[out] os_id Operating system's ID of the thread. +/// \param[out] running Running flag of the thread. +/// \param[out] name Name of the thread. +/// \param[out] parent_tid ID of the parent thread. +/// \param trace A buffer to store the stack trace. +/// \param trace_size Size in bytes of the trace buffer. +/// \returns Returns 1 if successful, 0 if not. +int __tsan_get_report_thread(void *report, unsigned long idx, int *tid, + uint64_t *os_id, int *running, const char **name, + int *parent_tid, void **trace, + unsigned long trace_size); + +/// Returns information about unique thread IDs included in the report. +/// +/// \param report Opaque pointer to the current report. +/// \param idx Index to the report's unique thread IDs. +/// \param[out] tid Unique thread ID of the report. +/// \returns Returns 1 if successful, 0 if not. +int __tsan_get_report_unique_tid(void *report, unsigned long idx, int *tid); + +/// Returns the current report. +/// +/// If TSan is currently reporting a detected issue on the current thread, +/// returns an opaque pointer to the current report. Otherwise returns NULL. 
+/// \returns An opaque pointer to the current report. Otherwise returns NULL. +void *__tsan_get_current_report(); + #ifdef __cplusplus } // extern "C" #endif diff --git a/compiler-rt/test/tsan/debug_mutex_bad_unlock.cpp b/compiler-rt/test/tsan/debug_mutex_bad_unlock.cpp new file mode 100644 index 0000000000000..3098f79391c2e --- /dev/null +++ b/compiler-rt/test/tsan/debug_mutex_bad_unlock.cpp @@ -0,0 +1,77 @@ +// RUN: %clangxx_tsan -O1 %s -o %t && %deflake %run %t | FileCheck %s + +#include "test.h" + +extern "C" { +void __tsan_on_report(void *report); +void *__tsan_get_current_report(); +int __tsan_get_report_data(void *report, const char **description, int *count, + int *stack_count, int *mop_count, int *loc_count, + int *mutex_count, int *thread_count, + int *unique_tid_count, void **sleep_trace, + unsigned long trace_size); +int __tsan_get_report_stack(void *report, unsigned long idx, void **trace, + unsigned long trace_size); +int __tsan_get_report_mutex(void *report, unsigned long idx, uint64_t *mutex_id, + void **addr, int *destroyed, void **trace, + unsigned long trace_size); +} + +int main() { + int m = 0; + fprintf(stderr, "&m = %p\n", &m); + // CHECK: &m = [[MUTEX:0x[0-9a-f]+]] + AnnotateRWLockReleased(__FILE__, __LINE__, &m, 1); + fprintf(stderr, "Done.\n"); + return 0; +} + +// Required for dyld macOS 12.0+ +#if (__APPLE__) +__attribute__((weak)) +#endif +__attribute__((disable_sanitizer_instrumentation)) extern "C" void +__tsan_on_report(void *report) { + fprintf(stderr, "__tsan_on_report(%p)\n", report); + fprintf(stderr, "__tsan_get_current_report() = %p\n", + __tsan_get_current_report()); + // CHECK: __tsan_on_report([[REPORT:0x[0-9a-f]+]]) + // CHECK: __tsan_get_current_report() = [[REPORT]] + + const char *description; + int count; + int stack_count, mop_count, loc_count, mutex_count, thread_count, + unique_tid_count; + void *sleep_trace[16] = {0}; + __tsan_get_report_data(report, &description, &count, &stack_count, &mop_count, + &loc_count, &mutex_count, &thread_count, + &unique_tid_count, sleep_trace, 16); + + fprintf(stderr, "stack_count = %d\n", stack_count); + // CHECK: stack_count = 1 + + fprintf(stderr, "mutex_count = %d\n", mutex_count); + // CHECK: mutex_count = 1 + + void *trace[16] = {0}; + __tsan_get_report_stack(report, 0, trace, 16); + + fprintf(stderr, "trace[0] = %p, trace[1] = %p, trace[2] = %p\n", trace[0], + trace[1], trace[2]); + // CHECK: trace[0] = 0x{{[0-9a-f]+}}, trace[1] = 0x{{[0-9a-f]+}}, trace[2] = + // {{0x0|\(nil\)|\(null\)}} + + uint64_t mutex_id; + void *addr; + int destroyed; + __tsan_get_report_mutex(report, 0, &mutex_id, &addr, &destroyed, trace, 16); + fprintf(stderr, "addr = %p, destroyed = %d\n", addr, destroyed); + // CHECK: addr = [[MUTEX]], destroyed = 0 + fprintf(stderr, "trace[0] = %p, trace[1] = %p, trace[2] = %p\n", trace[0], + trace[1], trace[2]); + // CHECK: trace[0] = 0x{{[0-9a-f]+}}, trace[1] = 0x{{[0-9a-f]+}}, trace[2] = + // {{0x0|\(nil\)|\(null\)}} +} + +// CHECK: Done. +// CHECK: ThreadSanitizer: reported 1 warnings From b75086210774a05181b8f313ba441ccc920d5d9e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 30 May 2023 12:14:11 +0100 Subject: [PATCH 069/704] [LV] Use early exit for stores storing the ptr operand. (NFC) Cleanup suggested in D150991. 
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0bf3b97161f92..0a99ccb5df373 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4701,13 +4701,9 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { // I, I is known to not require scalarization, and the pointer is not also // stored. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { - auto GetStoredValue = [I]() -> Value * { - if (!isa(I)) - return nullptr; - return I->getOperand(0); - }; - return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF) && - GetStoredValue() != Ptr; + if (isa(I) && I->getOperand(0) == Ptr) + return false; + return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); }; // Holds a list of values which are known to have at least one uniform use. From 1ef0bafc4f6f03805558b900d703292fb957c100 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 30 May 2023 06:49:14 -0500 Subject: [PATCH 070/704] [libc][NFC] Move the Linux file implementation to a subdirectory This patch simply moves the special handling for `linux` files to a subdirectory. This is done to make it easier in the future to extend this support to targets (like the GPU) that will have different dependencies. Reviewed By: lntue Differential Revision: https://reviews.llvm.org/D151231 --- libc/src/__support/File/CMakeLists.txt | 51 +++++++++---------- libc/src/__support/File/linux/CMakeLists.txt | 27 ++++++++++ .../File/{linux_dir.cpp => linux/dir.cpp} | 2 +- .../File/{linux_file.cpp => linux/file.cpp} | 2 +- 4 files changed, 52 insertions(+), 30 deletions(-) create mode 100644 libc/src/__support/File/linux/CMakeLists.txt rename libc/src/__support/File/{linux_dir.cpp => linux/dir.cpp} (97%) rename libc/src/__support/File/{linux_file.cpp => linux/file.cpp} (99%) diff --git a/libc/src/__support/File/CMakeLists.txt b/libc/src/__support/File/CMakeLists.txt index 79de9250c642b..53b2171deb6d7 100644 --- a/libc/src/__support/File/CMakeLists.txt +++ b/libc/src/__support/File/CMakeLists.txt @@ -29,34 +29,29 @@ add_object_library( libc.src.__support.threads.mutex ) -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}_file.cpp) - add_object_library( - platform_file - SRCS - ${LIBC_TARGET_OS}_file.cpp - DEPENDS - .file - libc.include.fcntl - libc.include.stdio - libc.include.sys_syscall - libc.src.__support.CPP.new - libc.src.__support.OSUtil.osutil - libc.src.errno.errno - libc.src.__support.error_or - ) +if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + return() endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}_dir.cpp) - add_object_library( - platform_dir - SRCS - ${LIBC_TARGET_OS}_dir.cpp - DEPENDS - .dir - libc.include.fcntl - libc.include.sys_syscall - libc.src.__support.OSUtil.osutil - libc.src.__support.error_or - libc.src.errno.errno - ) +add_subdirectory(${LIBC_TARGET_OS}) +set(target_file libc.src.__support.File.${LIBC_TARGET_OS}.${LIBC_TARGET_OS}_file) +set(target_dir libc.src.__support.File.${LIBC_TARGET_OS}.${LIBC_TARGET_OS}_dir) +if((NOT TARGET ${target_file}) OR (NOT TARGET ${target_dir})) + return() endif() + +add_object_library( + platform_file + ALIAS + ${target_file} + DEPENDS + ${target_file} +) + +add_object_library( + platform_dir + ALIAS + ${target_dir} + DEPENDS + ${target_dir} +) diff --git 
a/libc/src/__support/File/linux/CMakeLists.txt b/libc/src/__support/File/linux/CMakeLists.txt new file mode 100644 index 0000000000000..c2ba66462ca37 --- /dev/null +++ b/libc/src/__support/File/linux/CMakeLists.txt @@ -0,0 +1,27 @@ +add_object_library( + linux_file + SRCS + file.cpp + DEPENDS + libc.include.fcntl + libc.include.stdio + libc.include.sys_syscall + libc.src.__support.CPP.new + libc.src.__support.OSUtil.osutil + libc.src.errno.errno + libc.src.__support.error_or + libc.src.__support.File.file +) + +add_object_library( + linux_dir + SRCS + dir.cpp + DEPENDS + libc.include.fcntl + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.__support.error_or + libc.src.errno.errno + libc.src.__support.File.dir +) diff --git a/libc/src/__support/File/linux_dir.cpp b/libc/src/__support/File/linux/dir.cpp similarity index 97% rename from libc/src/__support/File/linux_dir.cpp rename to libc/src/__support/File/linux/dir.cpp index aae565ffb337a..4521a9bb44000 100644 --- a/libc/src/__support/File/linux_dir.cpp +++ b/libc/src/__support/File/linux/dir.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "dir.h" +#include "src/__support/File/dir.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/error_or.h" diff --git a/libc/src/__support/File/linux_file.cpp b/libc/src/__support/File/linux/file.cpp similarity index 99% rename from libc/src/__support/File/linux_file.cpp rename to libc/src/__support/File/linux/file.cpp index 001aa0cef9802..02746eeedc0bf 100644 --- a/libc/src/__support/File/linux_file.cpp +++ b/libc/src/__support/File/linux/file.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "file.h" +#include "src/__support/File/file.h" #include "src/__support/CPP/new.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. From 48339d0fbbdb2ba4610173ad1e792e9c2272c887 Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Mon, 17 Apr 2023 18:03:09 +0000 Subject: [PATCH 071/704] [CodeGen] Add pre-commit tests for D148558 This patch adds four new tests for upcoming functionality in LLVM: * complex-deinterleaving-add-mull-fixed-contract.ll * complex-deinterleaving-add-mull-scalable-contract.ll * complex-deinterleaving-add-mull-fixed-fast.ll * complex-deinterleaving-add-mull-scalable-fast.ll. These tests were generated from the IR of vectorizable loops, which were compiled from C++ code using different optimization flags in Clang. Each pair of tests corresponds to Neon and SVE architectures, respectively, and each pair contains tests compiled with -Ofast and -O3 -ffp-contract=fast -ffinite-math-only optimization flags. The tests were stripped of nnan and ninf flags as they have no impact on the output. The primary objective of these tests is to show the various sequences of complex computations that may be encountered and to demonstrate the ability of ComplexDeinterleaving to support any ordering. 
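For reference, a minimal C++ sketch of the kind of vectorizable loop such tests
are derived from (the exact source is not included here; the names and signature
are illustrative only):

  #include <complex>

  // out[i] = a[i] * b[i] + c[i]. With -Ofast, or with
  // -O3 -ffp-contract=fast -ffinite-math-only, the complex multiply is
  // expanded inline (no __muldc3 call) and the loop vectorizer is expected
  // to emit interleaved shufflevector/fmul/fadd IR similar to the mull_add
  // tests added below.
  void mull_add(const std::complex<double> *a, const std::complex<double> *b,
                const std::complex<double> *c, std::complex<double> *out,
                int n) {
    for (int i = 0; i != n; ++i)
      out[i] = a[i] * b[i] + c[i];
  }
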
Depends on D147451 Differential Revision: https://reviews.llvm.org/D148550 --- ...-deinterleaving-add-mull-fixed-contract.ll | 248 ++++++++++++++++ ...plex-deinterleaving-add-mull-fixed-fast.ll | 245 ++++++++++++++++ ...interleaving-add-mull-scalable-contract.ll | 273 ++++++++++++++++++ ...x-deinterleaving-add-mull-scalable-fast.ll | 273 ++++++++++++++++++ .../complex-deinterleaving-multiuses.ll | 104 +++++++ 5 files changed, 1143 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll new file mode 100644 index 0000000000000..1c5f713b0e330 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-contract.ll @@ -0,0 +1,248 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; a * b + c +define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c) { +; CHECK-LABEL: mull_add: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip2 v4.2d, v2.2d, v3.2d +; CHECK-NEXT: zip2 v5.2d, v0.2d, v1.2d +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v1.2d, v5.2d, v4.2d +; CHECK-NEXT: fmul v3.2d, v0.2d, v4.2d +; CHECK-NEXT: fneg v1.2d, v1.2d +; CHECK-NEXT: fmla v3.2d, v2.2d, v5.2d +; CHECK-NEXT: fmla v1.2d, v2.2d, v0.2d +; CHECK-NEXT: fadd v3.2d, v3.2d, v4.2d +; CHECK-NEXT: fadd v1.2d, v2.2d, v1.2d +; CHECK-NEXT: zip1 v0.2d, v1.2d, v3.2d +; CHECK-NEXT: zip2 v1.2d, v1.2d, v3.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec28 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec30 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec31 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul contract <2 x double> %strided.vec, %strided.vec31 + %1 = fmul contract <2 x double> %strided.vec28, %strided.vec30 + %2 = fadd contract <2 x double> %1, %0 + %3 = fmul contract <2 x double> %strided.vec, %strided.vec30 + %4 = fmul contract <2 x double> %strided.vec28, %strided.vec31 + %5 = fsub contract <2 x double> %3, %4 + %strided.vec33 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec34 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %6 = fadd contract <2 x double> %strided.vec33, %5 + %7 = fadd contract <2 x double> %2, %strided.vec34 + %interleaved.vec = shufflevector <2 x double> %6, <2 x double> %7, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; a * b + c * d +define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { +; CHECK-LABEL: mul_add_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #0 +; 
CHECK-NEXT: fcmla v18.2d, v4.2d, v6.2d, #0 +; CHECK-NEXT: fcmla v19.2d, v5.2d, v7.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #90 +; CHECK-NEXT: fcmla v18.2d, v4.2d, v6.2d, #90 +; CHECK-NEXT: fcmla v19.2d, v5.2d, v7.2d, #90 +; CHECK-NEXT: fadd v0.2d, v16.2d, v18.2d +; CHECK-NEXT: fadd v1.2d, v17.2d, v19.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec51 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec53 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec54 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul contract <2 x double> %strided.vec, %strided.vec54 + %1 = fmul contract <2 x double> %strided.vec51, %strided.vec53 + %2 = fadd contract <2 x double> %1, %0 + %3 = fmul contract <2 x double> %strided.vec, %strided.vec53 + %4 = fmul contract <2 x double> %strided.vec51, %strided.vec54 + %5 = fsub contract <2 x double> %3, %4 + %strided.vec56 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec57 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec59 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %strided.vec60 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %6 = fmul contract <2 x double> %strided.vec56, %strided.vec60 + %7 = fmul contract <2 x double> %strided.vec57, %strided.vec59 + %8 = fadd contract <2 x double> %7, %6 + %9 = fmul contract <2 x double> %strided.vec56, %strided.vec59 + %10 = fmul contract <2 x double> %strided.vec57, %strided.vec60 + %11 = fsub contract <2 x double> %9, %10 + %12 = fadd contract <2 x double> %5, %11 + %13 = fadd contract <2 x double> %2, %8 + %interleaved.vec = shufflevector <2 x double> %12, <2 x double> %13, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; a * b - c * d +define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { +; CHECK-LABEL: mul_sub_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #0 +; CHECK-NEXT: fcmla v18.2d, v4.2d, v6.2d, #0 +; CHECK-NEXT: fcmla v19.2d, v5.2d, v7.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #90 +; CHECK-NEXT: fcmla v18.2d, v4.2d, v6.2d, #90 +; CHECK-NEXT: fcmla v19.2d, v5.2d, v7.2d, #90 +; CHECK-NEXT: fsub v0.2d, v16.2d, v18.2d +; CHECK-NEXT: fsub v1.2d, v17.2d, v19.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec51 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec53 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec54 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul contract <2 x double> %strided.vec, %strided.vec54 + %1 = fmul contract <2 x double> %strided.vec51, %strided.vec53 + %2 = fadd contract <2 x double> %1, %0 + %3 = fmul contract <2 x double> %strided.vec, %strided.vec53 + %4 = fmul contract <2 x double> %strided.vec51, %strided.vec54 + %5 = fsub contract <2 x double> %3, %4 + %strided.vec56 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec57 = shufflevector <4 x 
double> %c, <4 x double> poison, <2 x i32> + %strided.vec59 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %strided.vec60 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %6 = fmul contract <2 x double> %strided.vec56, %strided.vec60 + %7 = fmul contract <2 x double> %strided.vec57, %strided.vec59 + %8 = fadd contract <2 x double> %7, %6 + %9 = fmul contract <2 x double> %strided.vec56, %strided.vec59 + %10 = fmul contract <2 x double> %strided.vec57, %strided.vec60 + %11 = fsub contract <2 x double> %9, %10 + %12 = fsub contract <2 x double> %5, %11 + %13 = fsub contract <2 x double> %2, %8 + %interleaved.vec = shufflevector <2 x double> %12, <2 x double> %13, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; a * b + conj(c) * d +define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { +; CHECK-LABEL: mul_conj_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #0 +; CHECK-NEXT: fcmla v18.2d, v6.2d, v4.2d, #0 +; CHECK-NEXT: fcmla v19.2d, v7.2d, v5.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v0.2d, v2.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v1.2d, v3.2d, #90 +; CHECK-NEXT: fcmla v18.2d, v6.2d, v4.2d, #270 +; CHECK-NEXT: fcmla v19.2d, v7.2d, v5.2d, #270 +; CHECK-NEXT: fadd v0.2d, v16.2d, v18.2d +; CHECK-NEXT: fadd v1.2d, v17.2d, v19.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec59 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec61 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec62 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul contract <2 x double> %strided.vec, %strided.vec62 + %1 = fmul contract <2 x double> %strided.vec59, %strided.vec61 + %2 = fadd contract <2 x double> %1, %0 + %3 = fmul contract <2 x double> %strided.vec, %strided.vec61 + %4 = fmul contract <2 x double> %strided.vec59, %strided.vec62 + %5 = fsub contract <2 x double> %3, %4 + %strided.vec64 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec65 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec67 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %strided.vec68 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %6 = fmul contract <2 x double> %strided.vec64, %strided.vec68 + %7 = fmul contract <2 x double> %strided.vec65, %strided.vec67 + %8 = fsub contract <2 x double> %6, %7 + %9 = fmul contract <2 x double> %strided.vec64, %strided.vec67 + %10 = fmul contract <2 x double> %strided.vec65, %strided.vec68 + %11 = fadd contract <2 x double> %9, %10 + %12 = fadd contract <2 x double> %5, %11 + %13 = fadd contract <2 x double> %2, %8 + %interleaved.vec = shufflevector <2 x double> %12, <2 x double> %13, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; a + b + 1i * c * d +define <4 x double> @mul_add_rot_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { +; CHECK-LABEL: mul_add_rot_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v16.2d, #0xffffffffffffffff +; CHECK-NEXT: zip2 v17.2d, v4.2d, v5.2d +; CHECK-NEXT: movi v18.2d, #0000000000000000 +; CHECK-NEXT: zip1 v19.2d, v0.2d, v1.2d +; CHECK-NEXT: fneg v16.2d, 
v16.2d +; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip1 v1.2d, v4.2d, v5.2d +; CHECK-NEXT: zip1 v5.2d, v2.2d, v3.2d +; CHECK-NEXT: mov v4.16b, v16.16b +; CHECK-NEXT: bsl v4.16b, v18.16b, v17.16b +; CHECK-NEXT: zip2 v2.2d, v2.2d, v3.2d +; CHECK-NEXT: mov v3.16b, v16.16b +; CHECK-NEXT: bsl v3.16b, v18.16b, v1.16b +; CHECK-NEXT: fadd v1.2d, v1.2d, v4.2d +; CHECK-NEXT: zip2 v4.2d, v6.2d, v7.2d +; CHECK-NEXT: zip1 v6.2d, v6.2d, v7.2d +; CHECK-NEXT: fmul v7.2d, v0.2d, v2.2d +; CHECK-NEXT: fsub v3.2d, v3.2d, v17.2d +; CHECK-NEXT: fmul v16.2d, v1.2d, v4.2d +; CHECK-NEXT: fmul v2.2d, v19.2d, v2.2d +; CHECK-NEXT: fneg v7.2d, v7.2d +; CHECK-NEXT: fmul v4.2d, v3.2d, v4.2d +; CHECK-NEXT: fneg v16.2d, v16.2d +; CHECK-NEXT: fmla v2.2d, v5.2d, v0.2d +; CHECK-NEXT: fmla v7.2d, v5.2d, v19.2d +; CHECK-NEXT: fmla v4.2d, v1.2d, v6.2d +; CHECK-NEXT: fmla v16.2d, v6.2d, v3.2d +; CHECK-NEXT: fadd v1.2d, v2.2d, v4.2d +; CHECK-NEXT: fadd v2.2d, v7.2d, v16.2d +; CHECK-NEXT: zip1 v0.2d, v2.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v2.2d, v1.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec77 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec79 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec80 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul contract <2 x double> %strided.vec, %strided.vec80 + %1 = fmul contract <2 x double> %strided.vec77, %strided.vec79 + %2 = fadd contract <2 x double> %1, %0 + %3 = fmul contract <2 x double> %strided.vec, %strided.vec79 + %4 = fmul contract <2 x double> %strided.vec77, %strided.vec80 + %5 = fsub contract <2 x double> %3, %4 + %strided.vec82 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec83 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %6 = tail call contract <2 x double> @llvm.copysign.v2f64(<2 x double> zeroinitializer, <2 x double> %strided.vec83) + %7 = fadd contract <2 x double> %strided.vec82, %6 + %8 = tail call contract <2 x double> @llvm.copysign.v2f64(<2 x double> zeroinitializer, <2 x double> %strided.vec82) + %9 = fsub contract <2 x double> %8, %strided.vec83 + %strided.vec85 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %strided.vec86 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %10 = fmul contract <2 x double> %9, %strided.vec86 + %11 = fmul contract <2 x double> %strided.vec85, %7 + %12 = fadd contract <2 x double> %11, %10 + %13 = fmul contract <2 x double> %9, %strided.vec85 + %14 = fmul contract <2 x double> %7, %strided.vec86 + %15 = fsub contract <2 x double> %13, %14 + %16 = fadd contract <2 x double> %5, %15 + %17 = fadd contract <2 x double> %2, %12 + %interleaved.vec = shufflevector <2 x double> %16, <2 x double> %17, <4 x i32> + ret <4 x double> %interleaved.vec +} + +declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll new file mode 100644 index 0000000000000..577c3ce8d95e1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll @@ -0,0 +1,245 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+complxnum,+neon -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; a * b + c +define <4 x double> @mull_add(<4 x double> %a, <4 x 
double> %b, <4 x double> %c) { +; CHECK-LABEL: mull_add: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip2 v6.2d, v4.2d, v5.2d +; CHECK-NEXT: zip1 v7.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip1 v1.2d, v4.2d, v5.2d +; CHECK-NEXT: zip1 v4.2d, v2.2d, v3.2d +; CHECK-NEXT: zip2 v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmla v6.2d, v0.2d, v4.2d +; CHECK-NEXT: fmla v1.2d, v7.2d, v4.2d +; CHECK-NEXT: fmla v6.2d, v7.2d, v2.2d +; CHECK-NEXT: fmls v1.2d, v0.2d, v2.2d +; CHECK-NEXT: zip1 v0.2d, v1.2d, v6.2d +; CHECK-NEXT: zip2 v1.2d, v1.2d, v6.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec28 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec30 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec31 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %strided.vec31, %strided.vec + %1 = fmul fast <2 x double> %strided.vec30, %strided.vec28 + %2 = fadd fast <2 x double> %0, %1 + %3 = fmul fast <2 x double> %strided.vec30, %strided.vec + %strided.vec33 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec34 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %4 = fadd fast <2 x double> %strided.vec33, %3 + %5 = fmul fast <2 x double> %strided.vec31, %strided.vec28 + %6 = fsub fast <2 x double> %4, %5 + %7 = fadd fast <2 x double> %2, %strided.vec34 + %interleaved.vec = shufflevector <2 x double> %6, <2 x double> %7, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; a * b + c * d +define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { +; CHECK-LABEL: mul_add_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip1 v16.2d, v2.2d, v3.2d +; CHECK-NEXT: zip1 v17.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d +; CHECK-NEXT: zip1 v2.2d, v4.2d, v5.2d +; CHECK-NEXT: zip2 v3.2d, v4.2d, v5.2d +; CHECK-NEXT: fmul v4.2d, v16.2d, v0.2d +; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d +; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d +; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmul v7.2d, v16.2d, v17.2d +; CHECK-NEXT: fmla v4.2d, v17.2d, v1.2d +; CHECK-NEXT: fmla v0.2d, v3.2d, v6.2d +; CHECK-NEXT: fmla v7.2d, v2.2d, v5.2d +; CHECK-NEXT: fmla v4.2d, v3.2d, v5.2d +; CHECK-NEXT: fsub v1.2d, v7.2d, v0.2d +; CHECK-NEXT: fmla v4.2d, v2.2d, v6.2d +; CHECK-NEXT: zip1 v0.2d, v1.2d, v4.2d +; CHECK-NEXT: zip2 v1.2d, v1.2d, v4.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec51 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec53 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec54 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %strided.vec54, %strided.vec + %1 = fmul fast <2 x double> %strided.vec53, %strided.vec51 + %2 = fmul fast <2 x double> %strided.vec53, %strided.vec + %3 = fmul fast <2 x double> %strided.vec54, %strided.vec51 + %strided.vec56 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec57 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec59 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %strided.vec60 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %4 = fmul fast <2 x double> %strided.vec60, %strided.vec56 + %5 = fmul fast <2 x 
double> %strided.vec59, %strided.vec57 + %6 = fmul fast <2 x double> %strided.vec59, %strided.vec56 + %7 = fmul fast <2 x double> %strided.vec60, %strided.vec57 + %8 = fadd fast <2 x double> %7, %3 + %9 = fadd fast <2 x double> %6, %2 + %10 = fsub fast <2 x double> %9, %8 + %11 = fadd fast <2 x double> %0, %1 + %12 = fadd fast <2 x double> %11, %5 + %13 = fadd fast <2 x double> %12, %4 + %interleaved.vec = shufflevector <2 x double> %10, <2 x double> %13, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; a * b - c * d +define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { +; CHECK-LABEL: mul_sub_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d +; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d +; CHECK-NEXT: zip2 v2.2d, v4.2d, v5.2d +; CHECK-NEXT: zip1 v3.2d, v6.2d, v7.2d +; CHECK-NEXT: zip1 v16.2d, v4.2d, v5.2d +; CHECK-NEXT: fmul v4.2d, v17.2d, v0.2d +; CHECK-NEXT: fmul v5.2d, v17.2d, v18.2d +; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d +; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d +; CHECK-NEXT: fmul v7.2d, v3.2d, v2.2d +; CHECK-NEXT: fmla v4.2d, v18.2d, v1.2d +; CHECK-NEXT: fmla v0.2d, v16.2d, v3.2d +; CHECK-NEXT: fmla v5.2d, v2.2d, v6.2d +; CHECK-NEXT: fmla v7.2d, v16.2d, v6.2d +; CHECK-NEXT: fsub v1.2d, v5.2d, v0.2d +; CHECK-NEXT: fsub v2.2d, v4.2d, v7.2d +; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d +; CHECK-NEXT: zip2 v1.2d, v1.2d, v2.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec53 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec55 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec56 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %strided.vec56, %strided.vec + %1 = fmul fast <2 x double> %strided.vec55, %strided.vec53 + %2 = fmul fast <2 x double> %strided.vec55, %strided.vec + %3 = fmul fast <2 x double> %strided.vec56, %strided.vec53 + %strided.vec58 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec59 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec61 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %strided.vec62 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %4 = fmul fast <2 x double> %strided.vec62, %strided.vec59 + %5 = fmul fast <2 x double> %strided.vec61, %strided.vec58 + %6 = fadd fast <2 x double> %5, %3 + %7 = fadd fast <2 x double> %4, %2 + %8 = fsub fast <2 x double> %7, %6 + %9 = fmul fast <2 x double> %strided.vec61, %strided.vec59 + %10 = fmul fast <2 x double> %strided.vec62, %strided.vec58 + %11 = fadd fast <2 x double> %10, %9 + %12 = fadd fast <2 x double> %0, %1 + %13 = fsub fast <2 x double> %12, %11 + %interleaved.vec = shufflevector <2 x double> %8, <2 x double> %13, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; a * b + conj(c) * d +define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { +; CHECK-LABEL: mul_conj_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip2 v16.2d, v2.2d, v3.2d +; CHECK-NEXT: zip2 v17.2d, v0.2d, v1.2d +; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: fmul v3.2d, v16.2d, v17.2d +; CHECK-NEXT: fmul v1.2d, v2.2d, v17.2d +; CHECK-NEXT: zip1 v17.2d, v4.2d, v5.2d +; CHECK-NEXT: zip2 v4.2d, v4.2d, v5.2d +; CHECK-NEXT: fneg v3.2d, 
v3.2d +; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d +; CHECK-NEXT: fmla v1.2d, v0.2d, v16.2d +; CHECK-NEXT: fmla v3.2d, v0.2d, v2.2d +; CHECK-NEXT: zip2 v0.2d, v6.2d, v7.2d +; CHECK-NEXT: fmls v1.2d, v4.2d, v5.2d +; CHECK-NEXT: fmla v3.2d, v17.2d, v5.2d +; CHECK-NEXT: fmla v1.2d, v17.2d, v0.2d +; CHECK-NEXT: fmla v3.2d, v4.2d, v0.2d +; CHECK-NEXT: zip1 v0.2d, v3.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v3.2d, v1.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec59 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec61 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec62 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %strided.vec62, %strided.vec + %1 = fmul fast <2 x double> %strided.vec61, %strided.vec59 + %2 = fmul fast <2 x double> %strided.vec61, %strided.vec + %strided.vec64 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec65 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec67 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %strided.vec68 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %3 = fmul fast <2 x double> %strided.vec68, %strided.vec64 + %4 = fmul fast <2 x double> %strided.vec67, %strided.vec64 + %5 = fmul fast <2 x double> %strided.vec68, %strided.vec65 + %6 = fmul fast <2 x double> %strided.vec62, %strided.vec59 + %7 = fsub fast <2 x double> %2, %6 + %8 = fadd fast <2 x double> %7, %4 + %9 = fadd fast <2 x double> %8, %5 + %10 = fadd fast <2 x double> %0, %1 + %11 = fmul fast <2 x double> %strided.vec67, %strided.vec65 + %12 = fsub fast <2 x double> %10, %11 + %13 = fadd fast <2 x double> %12, %3 + %interleaved.vec = shufflevector <2 x double> %9, <2 x double> %13, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; a + b + 1i * c * d +define <4 x double> @mul_add_rot_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { +; CHECK-LABEL: mul_add_rot_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: zip2 v16.2d, v2.2d, v3.2d +; CHECK-NEXT: zip2 v17.2d, v0.2d, v1.2d +; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-NEXT: zip2 v3.2d, v4.2d, v5.2d +; CHECK-NEXT: zip1 v18.2d, v6.2d, v7.2d +; CHECK-NEXT: fmul v19.2d, v16.2d, v17.2d +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip1 v1.2d, v4.2d, v5.2d +; CHECK-NEXT: fmul v4.2d, v2.2d, v17.2d +; CHECK-NEXT: zip2 v5.2d, v6.2d, v7.2d +; CHECK-NEXT: fmla v19.2d, v3.2d, v18.2d +; CHECK-NEXT: fmla v4.2d, v0.2d, v16.2d +; CHECK-NEXT: fmla v19.2d, v1.2d, v5.2d +; CHECK-NEXT: fmla v4.2d, v1.2d, v18.2d +; CHECK-NEXT: fneg v1.2d, v19.2d +; CHECK-NEXT: fmls v4.2d, v3.2d, v5.2d +; CHECK-NEXT: fmla v1.2d, v0.2d, v2.2d +; CHECK-NEXT: zip1 v0.2d, v1.2d, v4.2d +; CHECK-NEXT: zip2 v1.2d, v1.2d, v4.2d +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec79 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec81 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec82 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %strided.vec82, %strided.vec + %1 = fmul fast <2 x double> %strided.vec81, %strided.vec79 + %2 = fmul fast <2 x double> %strided.vec81, %strided.vec + %3 = fmul fast <2 x double> %strided.vec82, %strided.vec79 + %strided.vec84 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + 
%strided.vec85 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec87 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %strided.vec88 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %4 = fmul fast <2 x double> %strided.vec87, %strided.vec84 + %5 = fmul fast <2 x double> %strided.vec87, %strided.vec85 + %6 = fmul fast <2 x double> %strided.vec88, %strided.vec84 + %7 = fadd fast <2 x double> %5, %3 + %8 = fadd fast <2 x double> %7, %6 + %9 = fsub fast <2 x double> %2, %8 + %10 = fadd fast <2 x double> %0, %1 + %11 = fadd fast <2 x double> %10, %4 + %12 = fmul fast <2 x double> %strided.vec88, %strided.vec85 + %13 = fsub fast <2 x double> %11, %12 + %interleaved.vec = shufflevector <2 x double> %9, <2 x double> %13, <4 x i32> + ret <4 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll new file mode 100644 index 0000000000000..79ffe693fe311 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll @@ -0,0 +1,273 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve -o - | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; a * b + c +define @mull_add( %a, %b, %c) { +; CHECK-LABEL: mull_add: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp2 z6.d, z2.d, z3.d +; CHECK-NEXT: uzp2 z7.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d +; CHECK-NEXT: fmul z2.d, z0.d, z6.d +; CHECK-NEXT: fmla z2.d, p0/m, z7.d, z1.d +; CHECK-NEXT: fmul z3.d, z7.d, z6.d +; CHECK-NEXT: fnmsb z0.d, p0/m, z1.d, z3.d +; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z3.d, z4.d, z5.d +; CHECK-NEXT: fadd z3.d, z3.d, z0.d +; CHECK-NEXT: fadd z1.d, z2.d, z1.d +; CHECK-NEXT: zip1 z0.d, z3.d, z1.d +; CHECK-NEXT: zip2 z1.d, z3.d, z1.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec29 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec29, 0 + %3 = extractvalue { , } %strided.vec29, 1 + %4 = fmul contract %0, %3 + %5 = fmul contract %1, %2 + %6 = fadd contract %5, %4 + %7 = fmul contract %0, %2 + %8 = fmul contract %1, %3 + %9 = fsub contract %7, %8 + %strided.vec31 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %10 = extractvalue { , } %strided.vec31, 0 + %11 = extractvalue { , } %strided.vec31, 1 + %12 = fadd contract %10, %9 + %13 = fadd contract %6, %11 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %12, %13) + ret %interleaved.vec +} + +; a * b + c * d +define @mul_add_mull( %a, %b, %c, %d) { +; CHECK-LABEL: mul_add_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 +; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 +; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 +; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 +; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90 +; CHECK-NEXT: fcmla 
z24.d, p0/m, z7.d, z5.d, #90 +; CHECK-NEXT: fadd z0.d, z25.d, z27.d +; CHECK-NEXT: fadd z1.d, z26.d, z24.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec52 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec52, 0 + %3 = extractvalue { , } %strided.vec52, 1 + %4 = fmul contract %0, %3 + %5 = fmul contract %1, %2 + %6 = fadd contract %5, %4 + %7 = fmul contract %0, %2 + %8 = fmul contract %1, %3 + %9 = fsub contract %7, %8 + %strided.vec54 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %10 = extractvalue { , } %strided.vec54, 0 + %11 = extractvalue { , } %strided.vec54, 1 + %strided.vec56 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %d) + %12 = extractvalue { , } %strided.vec56, 0 + %13 = extractvalue { , } %strided.vec56, 1 + %14 = fmul contract %10, %13 + %15 = fmul contract %11, %12 + %16 = fadd contract %15, %14 + %17 = fmul contract %10, %12 + %18 = fmul contract %11, %13 + %19 = fsub contract %17, %18 + %20 = fadd contract %9, %19 + %21 = fadd contract %6, %16 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %20, %21) + ret %interleaved.vec +} + +; a * b - c * d +define @mul_sub_mull( %a, %b, %c, %d) { +; CHECK-LABEL: mul_sub_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 +; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 +; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 +; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 +; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 +; CHECK-NEXT: fsub z0.d, z25.d, z27.d +; CHECK-NEXT: fsub z1.d, z26.d, z24.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec52 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec52, 0 + %3 = extractvalue { , } %strided.vec52, 1 + %4 = fmul contract %0, %3 + %5 = fmul contract %1, %2 + %6 = fadd contract %5, %4 + %7 = fmul contract %0, %2 + %8 = fmul contract %1, %3 + %9 = fsub contract %7, %8 + %strided.vec54 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %10 = extractvalue { , } %strided.vec54, 0 + %11 = extractvalue { , } %strided.vec54, 1 + %strided.vec56 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %d) + %12 = extractvalue { , } %strided.vec56, 0 + %13 = extractvalue { , } %strided.vec56, 1 + %14 = fmul contract %10, %13 + %15 = fmul contract %11, %12 + %16 = fadd contract %15, %14 + %17 = fmul contract %10, %12 + %18 = fmul contract %11, %13 + %19 = fsub contract %17, %18 + %20 = fsub contract %9, %19 + %21 = fsub contract %6, %16 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %20, %21) + ret %interleaved.vec +} + +; a * b + conj(c) * d +define @mul_conj_mull( %a, %b, %c, %d) { +; CHECK-LABEL: mul_conj_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: 
ptrue p0.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: mov z26.d, z24.d +; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 +; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 +; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 +; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 +; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #270 +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 +; CHECK-NEXT: fadd z0.d, z25.d, z27.d +; CHECK-NEXT: fadd z1.d, z26.d, z24.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec60 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec60, 0 + %3 = extractvalue { , } %strided.vec60, 1 + %4 = fmul contract %0, %3 + %5 = fmul contract %1, %2 + %6 = fadd contract %5, %4 + %7 = fmul contract %0, %2 + %8 = fmul contract %1, %3 + %9 = fsub contract %7, %8 + %strided.vec62 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %10 = extractvalue { , } %strided.vec62, 0 + %11 = extractvalue { , } %strided.vec62, 1 + %strided.vec64 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %d) + %12 = extractvalue { , } %strided.vec64, 0 + %13 = extractvalue { , } %strided.vec64, 1 + %14 = fmul contract %10, %13 + %15 = fmul contract %11, %12 + %16 = fsub contract %14, %15 + %17 = fmul contract %10, %12 + %18 = fmul contract %11, %13 + %19 = fadd contract %17, %18 + %20 = fadd contract %9, %19 + %21 = fadd contract %6, %16 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %20, %21) + ret %interleaved.vec +} + +; a + b + 1i * c * d +define @mul_add_rot_mull( %a, %b, %c, %d) { +; CHECK-LABEL: mul_add_rot_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp2 z24.d, z4.d, z5.d +; CHECK-NEXT: mov z26.d, #0 // =0x0 +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: and z26.d, z26.d, #0x7fffffffffffffff +; CHECK-NEXT: and z25.d, z25.d, #0x8000000000000000 +; CHECK-NEXT: uzp2 z27.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z1.d, z4.d, z5.d +; CHECK-NEXT: orr z5.d, z26.d, z25.d +; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d +; CHECK-NEXT: fadd z5.d, z1.d, z5.d +; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 +; CHECK-NEXT: orr z1.d, z26.d, z1.d +; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d +; CHECK-NEXT: fsub z1.d, z1.d, z24.d +; CHECK-NEXT: uzp2 z24.d, z6.d, z7.d +; CHECK-NEXT: fmul z3.d, z0.d, z2.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp1 z6.d, z6.d, z7.d +; CHECK-NEXT: fmul z7.d, z1.d, z24.d +; CHECK-NEXT: fmla z3.d, p0/m, z27.d, z4.d +; CHECK-NEXT: fmla z7.d, p0/m, z6.d, z5.d +; CHECK-NEXT: fmul z2.d, z27.d, z2.d +; CHECK-NEXT: fmul z5.d, z5.d, z24.d +; CHECK-NEXT: fnmsb z0.d, p0/m, z4.d, z2.d +; CHECK-NEXT: fnmsb z1.d, p0/m, z6.d, z5.d +; CHECK-NEXT: fadd z1.d, z0.d, z1.d +; CHECK-NEXT: fadd z2.d, z3.d, z7.d +; CHECK-NEXT: zip1 z0.d, z1.d, z2.d +; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec78 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec78, 0 + %3 = extractvalue { , } %strided.vec78, 1 + %4 = fmul contract %0, %3 + %5 
= fmul contract %1, %2 + %6 = fadd contract %5, %4 + %7 = fmul contract %0, %2 + %8 = fmul contract %1, %3 + %9 = fsub contract %7, %8 + %strided.vec80 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %10 = extractvalue { , } %strided.vec80, 0 + %11 = extractvalue { , } %strided.vec80, 1 + %12 = tail call contract @llvm.copysign.nxv2f64( zeroinitializer, %11) + %13 = fadd contract %10, %12 + %14 = tail call contract @llvm.copysign.nxv2f64( zeroinitializer, %10) + %15 = fsub contract %14, %11 + %strided.vec82 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %d) + %16 = extractvalue { , } %strided.vec82, 0 + %17 = extractvalue { , } %strided.vec82, 1 + %18 = fmul contract %15, %17 + %19 = fmul contract %16, %13 + %20 = fadd contract %19, %18 + %21 = fmul contract %15, %16 + %22 = fmul contract %13, %17 + %23 = fsub contract %21, %22 + %24 = fadd contract %9, %23 + %25 = fadd contract %6, %20 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %24, %25) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4f64() +declare @llvm.experimental.vector.interleave2.nxv4f64(, ) +declare @llvm.copysign.nxv2f64(, ) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll new file mode 100644 index 0000000000000..f801a1bfd7e0a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll @@ -0,0 +1,273 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+sve -o - | FileCheck %s + +target triple = "aarch64-arm-none-eabi" + +; a * b + c +define @mull_add( %a, %b, %c) { +; CHECK-LABEL: mull_add: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp2 z6.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z7.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z1.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmla z1.d, p0/m, z4.d, z7.d +; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d +; CHECK-NEXT: movprfx z5, z6 +; CHECK-NEXT: fmla z5.d, p0/m, z4.d, z0.d +; CHECK-NEXT: movprfx z3, z5 +; CHECK-NEXT: fmla z3.d, p0/m, z2.d, z7.d +; CHECK-NEXT: fmls z1.d, p0/m, z2.d, z0.d +; CHECK-NEXT: zip1 z0.d, z1.d, z3.d +; CHECK-NEXT: zip2 z1.d, z1.d, z3.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec29 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec29, 0 + %3 = extractvalue { , } %strided.vec29, 1 + %4 = fmul fast %3, %0 + %5 = fmul fast %2, %1 + %6 = fadd fast %4, %5 + %7 = fmul fast %2, %0 + %strided.vec31 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %8 = extractvalue { , } %strided.vec31, 0 + %9 = extractvalue { , } %strided.vec31, 1 + %10 = fadd fast %8, %7 + %11 = fmul fast %3, %1 + %12 = fsub fast %10, %11 + %13 = fadd fast %6, %9 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %12, %13) + ret %interleaved.vec +} + +; a * b + c * d +define @mul_add_mull( %a, %b, %c, %d) { +; CHECK-LABEL: mul_add_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d +; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d +; CHECK-NEXT: fmul z2.d, z1.d, z0.d 
+; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d +; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z26.d, z6.d, z7.d +; CHECK-NEXT: fmul z1.d, z1.d, z25.d +; CHECK-NEXT: fmul z0.d, z24.d, z0.d +; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d +; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d +; CHECK-NEXT: fmla z1.d, p0/m, z26.d, z4.d +; CHECK-NEXT: fmla z2.d, p0/m, z26.d, z3.d +; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z3.d +; CHECK-NEXT: fmla z2.d, p0/m, z5.d, z4.d +; CHECK-NEXT: fsub z1.d, z1.d, z0.d +; CHECK-NEXT: zip1 z0.d, z1.d, z2.d +; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec52 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec52, 0 + %3 = extractvalue { , } %strided.vec52, 1 + %4 = fmul fast %3, %0 + %5 = fmul fast %2, %1 + %6 = fmul fast %2, %0 + %7 = fmul fast %3, %1 + %strided.vec54 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %8 = extractvalue { , } %strided.vec54, 0 + %9 = extractvalue { , } %strided.vec54, 1 + %strided.vec56 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %d) + %10 = extractvalue { , } %strided.vec56, 0 + %11 = extractvalue { , } %strided.vec56, 1 + %12 = fmul fast %11, %8 + %13 = fmul fast %10, %9 + %14 = fmul fast %10, %8 + %15 = fmul fast %11, %9 + %16 = fadd fast %15, %7 + %17 = fadd fast %14, %6 + %18 = fsub fast %17, %16 + %19 = fadd fast %4, %5 + %20 = fadd fast %19, %13 + %21 = fadd fast %20, %12 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %18, %21) + ret %interleaved.vec +} + +; a * b - c * d +define @mul_sub_mull( %a, %b, %c, %d) { +; CHECK-LABEL: mul_sub_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d +; CHECK-NEXT: fmul z2.d, z1.d, z0.d +; CHECK-NEXT: fmul z1.d, z1.d, z25.d +; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d +; CHECK-NEXT: uzp2 z6.d, z6.d, z7.d +; CHECK-NEXT: fmul z0.d, z24.d, z0.d +; CHECK-NEXT: fmla z1.d, p0/m, z6.d, z3.d +; CHECK-NEXT: fmul z3.d, z5.d, z3.d +; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d +; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z4.d +; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d +; CHECK-NEXT: fsub z1.d, z1.d, z0.d +; CHECK-NEXT: fsub z2.d, z2.d, z3.d +; CHECK-NEXT: zip1 z0.d, z1.d, z2.d +; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec54 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec54, 0 + %3 = extractvalue { , } %strided.vec54, 1 + %4 = fmul fast %3, %0 + %5 = fmul fast %2, %1 + %6 = fmul fast %2, %0 + %7 = fmul fast %3, %1 + %strided.vec56 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %8 = extractvalue { , } %strided.vec56, 0 + %9 = extractvalue { , } %strided.vec56, 1 + %strided.vec58 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %d) + %10 = extractvalue { , } %strided.vec58, 0 + %11 = extractvalue { , } %strided.vec58, 1 + %12 = fmul fast %11, %9 + %13 = 
fmul fast %10, %8 + %14 = fadd fast %13, %7 + %15 = fadd fast %12, %6 + %16 = fsub fast %15, %14 + %17 = fmul fast %10, %9 + %18 = fmul fast %11, %8 + %19 = fadd fast %18, %17 + %20 = fadd fast %4, %5 + %21 = fsub fast %20, %19 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %16, %21) + ret %interleaved.vec +} + +; a * b + conj(c) * d +define @mul_conj_mull( %a, %b, %c, %d) { +; CHECK-LABEL: mul_conj_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d +; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d +; CHECK-NEXT: fmul z2.d, z1.d, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmul z0.d, z24.d, z0.d +; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d +; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d +; CHECK-NEXT: fnmls z0.d, p0/m, z1.d, z25.d +; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d +; CHECK-NEXT: movprfx z1, z2 +; CHECK-NEXT: fmls z1.d, p0/m, z5.d, z3.d +; CHECK-NEXT: uzp2 z2.d, z6.d, z7.d +; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z4.d +; CHECK-NEXT: fmad z3.d, p0/m, z2.d, z0.d +; CHECK-NEXT: zip1 z0.d, z3.d, z1.d +; CHECK-NEXT: zip2 z1.d, z3.d, z1.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec60 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec60, 0 + %3 = extractvalue { , } %strided.vec60, 1 + %4 = fmul fast %3, %0 + %5 = fmul fast %2, %1 + %6 = fmul fast %2, %0 + %strided.vec62 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %7 = extractvalue { , } %strided.vec62, 0 + %8 = extractvalue { , } %strided.vec62, 1 + %strided.vec64 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %d) + %9 = extractvalue { , } %strided.vec64, 0 + %10 = extractvalue { , } %strided.vec64, 1 + %11 = fmul fast %10, %7 + %12 = fmul fast %9, %7 + %13 = fmul fast %10, %8 + %14 = fmul fast %3, %1 + %15 = fsub fast %6, %14 + %16 = fadd fast %15, %12 + %17 = fadd fast %16, %13 + %18 = fadd fast %4, %5 + %19 = fmul fast %9, %8 + %20 = fsub fast %18, %19 + %21 = fadd fast %20, %11 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %17, %21) + ret %interleaved.vec +} + +; a + b + 1i * c * d +define @mul_add_rot_mull( %a, %b, %c, %d) { +; CHECK-LABEL: mul_add_rot_mull: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d +; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d +; CHECK-NEXT: fmul z2.d, z1.d, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d +; CHECK-NEXT: fmul z0.d, z24.d, z0.d +; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z24.d, z6.d, z7.d +; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d +; CHECK-NEXT: fmla z0.d, p0/m, z24.d, z3.d +; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d +; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z4.d +; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d +; CHECK-NEXT: fmls z2.d, p0/m, z5.d, z3.d +; CHECK-NEXT: fnmsb z1.d, p0/m, z25.d, z0.d +; CHECK-NEXT: zip1 z0.d, z1.d, z2.d +; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: ret +entry: + %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) + %0 = extractvalue { , } %strided.vec, 0 + %1 = extractvalue { , } %strided.vec, 1 + %strided.vec80 = tail call { , } 
@llvm.experimental.vector.deinterleave2.nxv4f64( %b) + %2 = extractvalue { , } %strided.vec80, 0 + %3 = extractvalue { , } %strided.vec80, 1 + %4 = fmul fast %3, %0 + %5 = fmul fast %2, %1 + %6 = fmul fast %2, %0 + %7 = fmul fast %3, %1 + %strided.vec82 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %c) + %8 = extractvalue { , } %strided.vec82, 0 + %9 = extractvalue { , } %strided.vec82, 1 + %strided.vec84 = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %d) + %10 = extractvalue { , } %strided.vec84, 0 + %11 = extractvalue { , } %strided.vec84, 1 + %12 = fmul fast %10, %8 + %13 = fmul fast %10, %9 + %14 = fmul fast %11, %8 + %15 = fadd fast %13, %7 + %16 = fadd fast %15, %14 + %17 = fsub fast %6, %16 + %18 = fadd fast %4, %5 + %19 = fadd fast %18, %12 + %20 = fmul fast %11, %9 + %21 = fsub fast %19, %20 + %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %17, %21) + ret %interleaved.vec +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv4f64() +declare @llvm.experimental.vector.interleave2.nxv4f64(, ) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll index 4d84636e92ca2..9409bb9530e0e 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll @@ -293,3 +293,107 @@ entry: ret <4 x float> %interleaved.vec136 } +; Expected to transform. Shows that composite common subexpression is not generated twice. +; u[i] = a[i] * b[i] - (c[i] * d[i] + g[i] * h[i]); +; v[i] = e[i] * f[i] + (c[i] * d[i] + g[i] * h[i]); +define void @mul_add_common_mul_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e, <4 x double> %f, <4 x double> %g, <4 x double> %h, ptr %p1, ptr %p2) { +; CHECK-LABEL: mul_add_common_mul_add_mul: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp q17, q16, [sp, #96] +; CHECK-NEXT: zip2 v20.2d, v4.2d, v5.2d +; CHECK-NEXT: zip2 v21.2d, v6.2d, v7.2d +; CHECK-NEXT: zip1 v4.2d, v4.2d, v5.2d +; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d +; CHECK-NEXT: ldp q19, q18, [sp, #64] +; CHECK-NEXT: zip2 v23.2d, v17.2d, v16.2d +; CHECK-NEXT: fmul v6.2d, v21.2d, v20.2d +; CHECK-NEXT: zip1 v16.2d, v17.2d, v16.2d +; CHECK-NEXT: zip2 v22.2d, v19.2d, v18.2d +; CHECK-NEXT: zip1 v18.2d, v19.2d, v18.2d +; CHECK-NEXT: fneg v6.2d, v6.2d +; CHECK-NEXT: fmul v20.2d, v5.2d, v20.2d +; CHECK-NEXT: fmul v7.2d, v22.2d, v23.2d +; CHECK-NEXT: fmla v6.2d, v4.2d, v5.2d +; CHECK-NEXT: zip2 v5.2d, v2.2d, v3.2d +; CHECK-NEXT: fneg v7.2d, v7.2d +; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmla v7.2d, v18.2d, v16.2d +; CHECK-NEXT: fadd v19.2d, v7.2d, v6.2d +; CHECK-NEXT: fmla v20.2d, v4.2d, v21.2d +; CHECK-NEXT: zip2 v4.2d, v0.2d, v1.2d +; CHECK-NEXT: ldp q7, q6, [sp] +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: fmla v20.2d, v18.2d, v23.2d +; CHECK-NEXT: fmul v1.2d, v2.2d, v4.2d +; CHECK-NEXT: fmla v20.2d, v22.2d, v16.2d +; CHECK-NEXT: mov v3.16b, v19.16b +; CHECK-NEXT: fmla v1.2d, v0.2d, v5.2d +; CHECK-NEXT: fmla v3.2d, v4.2d, v5.2d +; CHECK-NEXT: ldp q16, q4, [sp, #32] +; CHECK-NEXT: fneg v17.2d, v3.2d +; CHECK-NEXT: zip1 v3.2d, v7.2d, v6.2d +; CHECK-NEXT: zip2 v6.2d, v7.2d, v6.2d +; CHECK-NEXT: zip1 v5.2d, v16.2d, v4.2d +; CHECK-NEXT: fmla v17.2d, v0.2d, v2.2d +; CHECK-NEXT: fsub v18.2d, v1.2d, v20.2d +; CHECK-NEXT: zip2 v0.2d, v16.2d, v4.2d +; CHECK-NEXT: fmla v19.2d, v3.2d, v5.2d +; CHECK-NEXT: st2 { v17.2d, v18.2d }, [x0] +; 
CHECK-NEXT: fmls v19.2d, v6.2d, v0.2d +; CHECK-NEXT: fmla v20.2d, v6.2d, v5.2d +; CHECK-NEXT: fmla v20.2d, v3.2d, v0.2d +; CHECK-NEXT: st2 { v19.2d, v20.2d }, [x1] +; CHECK-NEXT: ret +entry: + %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec123 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %strided.vec125 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %strided.vec126 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %strided.vec125, %strided.vec + %1 = fmul fast <2 x double> %strided.vec126, %strided.vec + %2 = fmul fast <2 x double> %strided.vec125, %strided.vec123 + %3 = fadd fast <2 x double> %1, %2 + %strided.vec128 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec129 = shufflevector <4 x double> %c, <4 x double> poison, <2 x i32> + %strided.vec131 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %strided.vec132 = shufflevector <4 x double> %d, <4 x double> poison, <2 x i32> + %4 = fmul fast <2 x double> %strided.vec131, %strided.vec128 + %5 = fmul fast <2 x double> %strided.vec132, %strided.vec129 + %6 = fmul fast <2 x double> %strided.vec132, %strided.vec128 + %7 = fmul fast <2 x double> %strided.vec131, %strided.vec129 + %8 = fsub fast <2 x double> %4, %5 + %strided.vec134 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> + %strided.vec135 = shufflevector <4 x double> %g, <4 x double> poison, <2 x i32> + %strided.vec137 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> + %strided.vec138 = shufflevector <4 x double> %h, <4 x double> poison, <2 x i32> + %9 = fmul fast <2 x double> %strided.vec138, %strided.vec134 + %10 = fmul fast <2 x double> %strided.vec137, %strided.vec135 + %11 = fmul fast <2 x double> %strided.vec137, %strided.vec134 + %12 = fmul fast <2 x double> %strided.vec135, %strided.vec138 + %13 = fsub fast <2 x double> %11, %12 + %14 = fadd fast <2 x double> %13, %8 + %15 = fadd fast <2 x double> %6, %7 + %16 = fadd fast <2 x double> %15, %9 + %17 = fadd fast <2 x double> %16, %10 + %18 = fmul fast <2 x double> %strided.vec126, %strided.vec123 + %19 = fadd fast <2 x double> %18, %14 + %20 = fsub fast <2 x double> %0, %19 + %21 = fsub fast <2 x double> %3, %17 + %interleaved.vec = shufflevector <2 x double> %20, <2 x double> %21, <4 x i32> + store <4 x double> %interleaved.vec, ptr %p1, align 8 + %strided.vec140 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> + %strided.vec141 = shufflevector <4 x double> %e, <4 x double> poison, <2 x i32> + %strided.vec143 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> + %strided.vec144 = shufflevector <4 x double> %f, <4 x double> poison, <2 x i32> + %22 = fmul fast <2 x double> %strided.vec143, %strided.vec140 + %23 = fmul fast <2 x double> %strided.vec144, %strided.vec140 + %24 = fmul fast <2 x double> %strided.vec143, %strided.vec141 + %25 = fadd fast <2 x double> %22, %14 + %26 = fmul fast <2 x double> %strided.vec144, %strided.vec141 + %27 = fsub fast <2 x double> %25, %26 + %28 = fadd fast <2 x double> %24, %17 + %29 = fadd fast <2 x double> %28, %23 + %interleaved.vec145 = shufflevector <2 x double> %27, <2 x double> %29, <4 x i32> + store <4 x double> %interleaved.vec145, ptr %p2, align 8 + ret void +} From ab4b924832ce26c21b88d7f82fcf4992ea8906bb Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 29 May 2023 17:16:05 +0100 Subject: [PATCH 072/704] [X86] X86FixupVectorConstantsPass - attempt to 
replace full width integer vector constant loads with broadcasts on AVX2+ targets lowerBuildVectorAsBroadcast will not broadcast splat constants in all cases, resulting in a lot of situations where a full width vector load that has failed to fold but is loading splat constant values could use a broadcast load instruction just as cheaply, and save constant pool space. --- .../Target/X86/X86FixupVectorConstants.cpp | 42 + llvm/test/CodeGen/X86/abdu-vector-128.ll | 10 +- .../any_extend_vector_inreg_of_broadcast.ll | 20 +- ...d_vector_inreg_of_broadcast_from_memory.ll | 14 +- llvm/test/CodeGen/X86/avx2-arith.ll | 4 +- llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll | 30 +- llvm/test/CodeGen/X86/avx2-shift.ll | 4 +- llvm/test/CodeGen/X86/avx2-vector-shifts.ll | 6 +- llvm/test/CodeGen/X86/avx512-arith.ll | 6 +- .../X86/avx512-intrinsics-fast-isel.ll | 6 +- .../X86/avx512-shuffles/partial_permute.ll | 12 +- llvm/test/CodeGen/X86/avx512bw-intrinsics.ll | 10 +- .../X86/bitcast-int-to-vector-bool-zext.ll | 2 +- llvm/test/CodeGen/X86/bitcast-vector-bool.ll | 2 +- llvm/test/CodeGen/X86/combine-bitselect.ll | 15 +- llvm/test/CodeGen/X86/combine-pavg.ll | 38 +- llvm/test/CodeGen/X86/combine-sdiv.ll | 4 +- llvm/test/CodeGen/X86/combine-shl.ll | 3 +- llvm/test/CodeGen/X86/combine-smax.ll | 30 +- llvm/test/CodeGen/X86/combine-smin.ll | 30 +- llvm/test/CodeGen/X86/combine-sra.ll | 3 +- llvm/test/CodeGen/X86/combine-srl.ll | 6 +- llvm/test/CodeGen/X86/concat-cast.ll | 2 +- .../copy-low-subvec-elt-to-high-subvec-elt.ll | 12 +- llvm/test/CodeGen/X86/dpbusd_i4.ll | 4 +- llvm/test/CodeGen/X86/freeze-vector.ll | 4 +- llvm/test/CodeGen/X86/gfni-funnel-shifts.ll | 4 +- llvm/test/CodeGen/X86/gfni-rotates.ll | 4 +- llvm/test/CodeGen/X86/gfni-shifts.ll | 41 +- .../CodeGen/X86/horizontal-reduce-umax.ll | 2 +- .../CodeGen/X86/horizontal-reduce-umin.ll | 2 +- llvm/test/CodeGen/X86/icmp-pow2-diff.ll | 4 +- .../X86/insert-into-constant-vector.ll | 22 +- .../CodeGen/X86/machine-combiner-int-vec.ll | 4 +- llvm/test/CodeGen/X86/masked_store_trunc.ll | 4 +- .../CodeGen/X86/masked_store_trunc_ssat.ll | 173 +- .../CodeGen/X86/masked_store_trunc_usat.ll | 175 +- llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 104 +- llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 40 +- .../CodeGen/X86/min-legal-vector-width.ll | 4 +- llvm/test/CodeGen/X86/movmsk-cmp.ll | 4 +- llvm/test/CodeGen/X86/oddshuffles.ll | 6 +- llvm/test/CodeGen/X86/paddus.ll | 20 +- llvm/test/CodeGen/X86/pmaddubsw.ll | 4 +- llvm/test/CodeGen/X86/pmul.ll | 32 +- llvm/test/CodeGen/X86/pmulh.ll | 2 +- llvm/test/CodeGen/X86/pr37499.ll | 6 +- llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll | 3 +- llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll | 58 +- .../CodeGen/X86/prefer-avx256-wide-mul.ll | 4 +- llvm/test/CodeGen/X86/psubus.ll | 2 +- llvm/test/CodeGen/X86/sadd_sat_vec.ll | 12 +- llvm/test/CodeGen/X86/sat-add.ll | 8 +- .../test/CodeGen/X86/setcc-non-simple-type.ll | 4 +- .../X86/shuffle-strided-with-offset-256.ll | 351 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 177 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 8 +- llvm/test/CodeGen/X86/slow-pmulld.ll | 6 +- .../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 4 +- llvm/test/CodeGen/X86/sshl_sat_vec.ll | 4 +- llvm/test/CodeGen/X86/ssub_sat_vec.ll | 12 +- llvm/test/CodeGen/X86/uadd_sat_vec.ll | 36 +- llvm/test/CodeGen/X86/umax.ll | 25 +- .../X86/urem-seteq-vec-tautological.ll | 2 +- llvm/test/CodeGen/X86/usub_sat_vec.ll | 36 +- llvm/test/CodeGen/X86/vec_anyext.ll | 2 +- llvm/test/CodeGen/X86/vec_cmp_uint-128.ll | 8 +- 
llvm/test/CodeGen/X86/vec_int_to_fp.ll | 77 +- llvm/test/CodeGen/X86/vec_minmax_uint.ll | 8 +- llvm/test/CodeGen/X86/vec_smulo.ll | 6 +- llvm/test/CodeGen/X86/vec_uaddo.ll | 31 +- llvm/test/CodeGen/X86/vec_umulo.ll | 6 +- llvm/test/CodeGen/X86/vec_usubo.ll | 31 +- llvm/test/CodeGen/X86/vector-bitreverse.ll | 408 +- llvm/test/CodeGen/X86/vector-blend.ll | 34 +- llvm/test/CodeGen/X86/vector-fshl-128.ll | 159 +- llvm/test/CodeGen/X86/vector-fshl-256.ll | 24 +- llvm/test/CodeGen/X86/vector-fshl-512.ll | 32 +- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 40 +- llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 18 +- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 12 +- .../CodeGen/X86/vector-fshl-rot-sub128.ll | 6 +- llvm/test/CodeGen/X86/vector-fshr-128.ll | 293 +- llvm/test/CodeGen/X86/vector-fshr-256.ll | 38 +- llvm/test/CodeGen/X86/vector-fshr-512.ll | 52 +- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 85 +- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 32 +- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 20 +- .../CodeGen/X86/vector-fshr-rot-sub128.ll | 6 +- llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll | 8 +- llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll | 14 +- llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll | 38 +- llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 6 +- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 24 +- .../vector-interleaved-load-i16-stride-3.ll | 10 +- .../vector-interleaved-load-i16-stride-4.ll | 35 +- .../vector-interleaved-load-i16-stride-5.ll | 80 +- .../vector-interleaved-load-i16-stride-6.ll | 170 +- .../vector-interleaved-load-i16-stride-7.ll | 624 +- .../vector-interleaved-load-i16-stride-8.ll | 1557 +- .../vector-interleaved-load-i32-stride-3.ll | 4 +- .../vector-interleaved-load-i32-stride-4.ll | 4 +- .../vector-interleaved-load-i32-stride-6.ll | 14 +- .../vector-interleaved-load-i32-stride-7.ll | 53 +- .../vector-interleaved-load-i32-stride-8.ll | 8930 ++++++-- .../vector-interleaved-load-i64-stride-6.ll | 6 +- .../vector-interleaved-load-i64-stride-7.ll | 4423 +++- .../vector-interleaved-load-i64-stride-8.ll | 11432 ++++++++-- .../vector-interleaved-load-i8-stride-2.ll | 74 +- .../vector-interleaved-load-i8-stride-3.ll | 209 +- .../vector-interleaved-load-i8-stride-4.ll | 156 +- .../vector-interleaved-load-i8-stride-5.ll | 109 +- .../vector-interleaved-load-i8-stride-6.ll | 22 +- .../vector-interleaved-load-i8-stride-7.ll | 423 +- .../vector-interleaved-load-i8-stride-8.ll | 1545 +- .../vector-interleaved-store-i16-stride-3.ll | 12 +- .../vector-interleaved-store-i16-stride-5.ll | 72 +- .../vector-interleaved-store-i16-stride-6.ll | 33 +- .../vector-interleaved-store-i16-stride-7.ll | 3209 ++- .../vector-interleaved-store-i32-stride-6.ll | 6 +- .../vector-interleaved-store-i32-stride-7.ll | 12 +- .../vector-interleaved-store-i64-stride-6.ll | 4236 +++- .../vector-interleaved-store-i64-stride-7.ll | 216 +- .../vector-interleaved-store-i64-stride-8.ll | 17334 +++++++++++++--- .../vector-interleaved-store-i8-stride-3.ll | 12 +- .../vector-interleaved-store-i8-stride-5.ll | 629 +- .../vector-interleaved-store-i8-stride-6.ll | 892 +- .../vector-interleaved-store-i8-stride-7.ll | 1394 +- llvm/test/CodeGen/X86/vector-lzcnt-256.ll | 96 +- llvm/test/CodeGen/X86/vector-lzcnt-512.ll | 80 +- .../CodeGen/X86/vector-popcnt-128-ult-ugt.ll | 988 +- llvm/test/CodeGen/X86/vector-popcnt-128.ll | 173 +- .../CodeGen/X86/vector-popcnt-256-ult-ugt.ll | 1368 +- llvm/test/CodeGen/X86/vector-popcnt-256.ll | 30 +- .../CodeGen/X86/vector-popcnt-512-ult-ugt.ll | 2534 ++- 
llvm/test/CodeGen/X86/vector-popcnt-512.ll | 55 +- .../CodeGen/X86/vector-reduce-add-mask.ll | 4 +- .../CodeGen/X86/vector-reduce-and-bool.ll | 76 +- llvm/test/CodeGen/X86/vector-reduce-umax.ll | 31 +- llvm/test/CodeGen/X86/vector-reduce-umin.ll | 31 +- .../CodeGen/X86/vector-replicaton-i1-mask.ll | 3 +- llvm/test/CodeGen/X86/vector-rotate-128.ll | 31 +- llvm/test/CodeGen/X86/vector-rotate-256.ll | 18 +- llvm/test/CodeGen/X86/vector-rotate-512.ll | 12 +- .../test/CodeGen/X86/vector-shift-ashr-128.ll | 136 +- .../test/CodeGen/X86/vector-shift-ashr-256.ll | 54 +- .../test/CodeGen/X86/vector-shift-ashr-512.ll | 14 +- .../CodeGen/X86/vector-shift-ashr-sub128.ll | 157 +- .../test/CodeGen/X86/vector-shift-lshr-512.ll | 6 +- .../CodeGen/X86/vector-shift-lshr-sub128.ll | 4 +- llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 8 +- llvm/test/CodeGen/X86/vector-shift-shl-512.ll | 8 +- .../CodeGen/X86/vector-shift-shl-sub128.ll | 4 +- .../CodeGen/X86/vector-shuffle-128-v16.ll | 148 +- .../CodeGen/X86/vector-shuffle-256-v16.ll | 2 +- .../CodeGen/X86/vector-shuffle-256-v32.ll | 20 +- .../CodeGen/X86/vector-shuffle-512-v32.ll | 2 +- .../CodeGen/X86/vector-shuffle-512-v64.ll | 18 +- .../test/CodeGen/X86/vector-shuffle-avx512.ll | 4 +- .../X86/vector-shuffle-combining-avx512bw.ll | 12 +- .../CodeGen/X86/vector-shuffle-combining.ll | 46 +- llvm/test/CodeGen/X86/vector-shuffle-v192.ll | 21 +- llvm/test/CodeGen/X86/vector-shuffle-v48.ll | 3 +- llvm/test/CodeGen/X86/vector-trunc-math.ll | 15 +- llvm/test/CodeGen/X86/vector-trunc-packus.ll | 145 +- llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 150 +- llvm/test/CodeGen/X86/vector-trunc-usat.ll | 148 +- llvm/test/CodeGen/X86/vector-trunc.ll | 28 +- llvm/test/CodeGen/X86/vector-tzcnt-128.ll | 334 +- llvm/test/CodeGen/X86/vector-tzcnt-256.ll | 140 +- llvm/test/CodeGen/X86/vector-tzcnt-512.ll | 90 +- llvm/test/CodeGen/X86/vector-unsigned-cmp.ll | 118 +- ...vector_splat-const-shift-of-constmasked.ll | 200 +- llvm/test/CodeGen/X86/vselect-avx.ll | 2 +- llvm/test/CodeGen/X86/vselect-minmax.ll | 16 +- llvm/test/CodeGen/X86/vselect-pcmp.ll | 2 +- llvm/test/CodeGen/X86/vselect-post-combine.ll | 2 +- .../CodeGen/X86/x86-interleaved-access.ll | 63 +- .../CodeGen/X86/zero_extend_vector_inreg.ll | 3 +- .../zero_extend_vector_inreg_of_broadcast.ll | 46 +- ...d_vector_inreg_of_broadcast_from_memory.ll | 43 +- 181 files changed, 51532 insertions(+), 17393 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 03e474b9e2e18..161978be7ff62 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -231,6 +231,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, unsigned Opc = MI.getOpcode(); MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool(); bool HasDQI = ST->hasDQI(); + bool HasBWI = ST->hasBWI(); auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64, unsigned OpBcst32, @@ -305,6 +306,47 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, HasDQI ? X86::VBROADCASTF32X8rm : X86::VBROADCASTF64X4rm, HasDQI ? 
X86::VBROADCASTF64X2rm : X86::VBROADCASTF32X4rm, X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 1); + /* Integer Loads */ + case X86::VMOVDQArm: + case X86::VMOVDQUrm: + if (ST->hasAVX2()) + return ConvertToBroadcast(0, 0, X86::VPBROADCASTQrm, X86::VPBROADCASTDrm, + X86::VPBROADCASTWrm, X86::VPBROADCASTBrm, 1); + return false; + case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: + if (ST->hasAVX2()) + return ConvertToBroadcast(0, X86::VBROADCASTI128, X86::VPBROADCASTQYrm, + X86::VPBROADCASTDYrm, X86::VPBROADCASTWYrm, + X86::VPBROADCASTBYrm, 1); + return false; + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQU64Z128rm: + return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm, + X86::VPBROADCASTDZ128rm, + HasBWI ? X86::VPBROADCASTWZ128rm : 0, + HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1); + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQU64Z256rm: + return ConvertToBroadcast( + 0, HasDQI ? X86::VBROADCASTI64X2Z128rm : X86::VBROADCASTI32X4Z256rm, + X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm, + HasBWI ? X86::VPBROADCASTWZ256rm : 0, + HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1); + case X86::VMOVDQA32Zrm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQU64Zrm: + return ConvertToBroadcast( + HasDQI ? X86::VBROADCASTI32X8rm : X86::VBROADCASTI64X4rm, + HasDQI ? X86::VBROADCASTI64X2rm : X86::VBROADCASTI32X4rm, + X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm, + HasBWI ? X86::VPBROADCASTWZrm : 0, HasBWI ? X86::VPBROADCASTBZrm : 0, + 1); } // Attempt to find a AVX512 mapping from a full width memory-fold instruction diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll index f57c6e45dc7f4..b10a6fabb05b6 100644 --- a/llvm/test/CodeGen/X86/abdu-vector-128.ll +++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll @@ -261,7 +261,7 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: abd_ext_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -338,7 +338,7 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: abd_ext_v2i64_undef: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -508,7 +508,7 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: abd_minmax_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -675,7 +675,7 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: abd_cmp_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, 
%xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -757,7 +757,7 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin ; ; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 99a15ef81b9aa..d9ce46f202423 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1221,7 +1221,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1346,7 +1346,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1466,7 +1466,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -2695,7 +2696,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2965,7 +2966,7 @@ define void 
@vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -3244,7 +3245,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4834,7 +4836,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,7] +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7] +; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -4955,7 +4958,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,7] +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7] +; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2] diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index f64d9aa3280f0..bae04d9dc769b 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1012,7 +1012,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1110,7 +1110,7 
@@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1202,7 +1202,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -2142,7 +2143,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2375,7 +2376,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2612,7 +2613,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll 
b/llvm/test/CodeGen/X86/avx2-arith.ll index e2e5cadf9d465..2dc33d32e9d3a 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -174,7 +174,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X32-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; X32-NEXT: vpand %ymm3, %ymm2, %ymm2 ; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -188,7 +188,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X64-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; X64-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; X64-NEXT: vpand %ymm3, %ymm2, %ymm2 ; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll index 15e2c3890354f..7cf459e566617 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -1497,27 +1497,27 @@ define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) { define <2 x i64> @test_x86_avx2_psrlv_q_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,0,4,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,0,4,0] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # EVEX TO VEX Compression xmm0 = [4,0,4,0] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # EVEX TO VEX Compression xmm0 = [4,0,4,0] +; 
X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_q_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4] -; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,4] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] @@ -1554,18 +1554,18 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) { define <4 x i64> @test_x86_avx2_psrlv_q_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [4,0,4,0,4,0,4,0] -; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 # EVEX TO VEX Compression ymm0 = [4,0,4,0,4,0,4,0] +; X86-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll index 7f163ef266c7a..f70c547958519 100644 --- a/llvm/test/CodeGen/X86/avx2-shift.ll +++ b/llvm/test/CodeGen/X86/avx2-shift.ll @@ -377,7 +377,7 @@ define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind { ; X86: # %bb.0: ; X86-NEXT: vpsrlw $3, %ymm0, %ymm0 ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X86-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; X86-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl @@ -386,7 +386,7 @@ define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind { ; X64: # %bb.0: ; X64-NEXT: vpsrlw $3, %ymm0, %ymm0 ; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X64-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll index 7348a8a6db8c7..8fb7c65a9a60b 100644 --- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll @@ -276,7 +276,8 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; ; X86-FAST-ALL-LABEL: srl_trunc_and_v4i64: ; X86-FAST-ALL: # %bb.0: -; X86-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; X86-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; X86-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; X86-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X86-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 @@ -306,7 +307,8 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { ; ; X64-FAST-ALL-LABEL: srl_trunc_and_v4i64: ; X64-FAST-ALL: # %bb.0: -; X64-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; X64-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; X64-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; X64-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] ; X64-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll index 5c333da422dcc..25e297993bd7c 100644 --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -293,7 +293,7 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; AVX512F-LABEL: imulq128_bcast: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -313,7 +313,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; ; AVX512BW-LABEL: imulq128_bcast: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -324,7 +324,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; AVX512DQ-LABEL: imulq128_bcast: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] ; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll 
b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index f4c6532e8da35..780abc9f9dc43 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6775,7 +6775,7 @@ define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0] +; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0] ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; X86-NEXT: vpsrlq $32, %ymm1, %ymm2 @@ -7978,7 +7978,7 @@ define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648] ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -8128,7 +8128,7 @@ define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) { ; X86: # %bb.0: # %entry ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647] +; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647] ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index cc0da34453eb5..ec0f14ae4e58e 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1649,7 +1649,8 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,5,3,2,u,u,u,u> +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,5,3,2,15,5,3,2] +; CHECK-NEXT: # ymm3 = mem[0,1,0,1] ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} @@ -1666,7 +1667,8 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32 ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <15,5,3,2,u,u,u,u> +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,5,3,2,15,5,3,2] +; CHECK-NEXT: # ymm1 = mem[0,1,0,1] ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} ; CHECK-NEXT: vmovdqa %xmm1, %xmm0 @@ -1993,7 +1995,8 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i define <4 x i64> 
@test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [6,3,6,3] +; CHECK-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,3,6,3] +; CHECK-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm0 ; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; CHECK-FAST-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} @@ -2016,7 +2019,8 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,3,6,3] +; CHECK-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3] +; CHECK-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 ; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} ; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index be7ac666cbbec..17d6266ab7c9e 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1412,18 +1412,20 @@ define <32 x i16>@test_int_x86_avx512_maskz_psrav32_hi(<32 x i16> %x0, <32 x i16 define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] -; X86-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] +; X86-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x5a,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_psrav32_hi_const: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] -; X64-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X64-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51] +; X64-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x5a,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 
a08200fde8e78..c22ec12c1b207 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -842,7 +842,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) { ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll index c092ed4f9f668..8687dae107f24 100644 --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -961,7 +961,7 @@ define i1 @trunc_v32i16_cmp(<32 x i16> %a0) nounwind { ; ; AVX512-LABEL: trunc_v32i16_cmp: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 ; AVX512-NEXT: kortestw %k0, %k0 diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll index 23f56d908cb2e..6a0dc4ae0695d 100644 --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -377,7 +377,8 @@ define <4 x i64> @bitselect_v4i64_rm(<4 x i64>, ptr nocapture readonly) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -429,7 +430,8 @@ define <4 x i64> @bitselect_v4i64_mr(ptr nocapture readonly, <4 x i64>) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -481,7 +483,8 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq 
@@ -489,7 +492,8 @@ define <4 x i64> @bitselect_v4i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512VL-LABEL: bitselect_v4i64_mm: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; AVX512VL-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512VL-NEXT: vpternlogq $202, (%rdi), %ymm1, %ymm0 ; AVX512VL-NEXT: retq %3 = load <4 x i64>, ptr %0 @@ -849,7 +853,8 @@ define <8 x i64> @bitselect_v8i64_mm(ptr nocapture readonly, ptr nocapture reado ; AVX512-LABEL: bitselect_v8i64_mm: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] +; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpternlogq $202, (%rdi), %zmm1, %zmm0 ; AVX512-NEXT: retq %3 = load <8 x i64>, ptr %0 diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll index a8923210427e5..4a3b765924513 100644 --- a/llvm/test/CodeGen/X86/combine-pavg.ll +++ b/llvm/test/CodeGen/X86/combine-pavg.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone @@ -31,17 +31,29 @@ define <16 x i8> @combine_pavgw_knownbits(<8 x i16> %a0, <8 x i16> %a1, <8 x i16 ; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_pavgw_knownbits: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] -; AVX-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm4, %xmm2, %xmm1 -; AVX-NEXT: vpand %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpavgw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_pavgw_knownbits: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pavgw_knownbits: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] +; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpand 
%xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm1 +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm2 +; AVX2-NEXT: vpavgw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %m0 = and <8 x i16> %a0, %m1 = and <8 x i16> %a1, %m2 = and <8 x i16> %a2, diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index bcdcfdd714784..8ede23616d65e 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -421,7 +421,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -437,7 +437,7 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) { ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll index e443e8472f31f..a05da63e43e12 100644 --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -148,7 +148,8 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; ; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and: ; AVX-FAST-ALL: # %bb.0: -; AVX-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; AVX-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-FAST-ALL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-smax.ll b/llvm/test/CodeGen/X86/combine-smax.ll index efaeb97b89d65..828a36d4450ae 100644 --- a/llvm/test/CodeGen/X86/combine-smax.ll +++ b/llvm/test/CodeGen/X86/combine-smax.ll @@ -2,10 +2,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s 
--check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2 define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: test_v16i8_nosignbit: @@ -32,13 +32,21 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) { ; SSE42-NEXT: pmaxsb %xmm2, %xmm0 ; SSE42-NEXT: retq ; -; AVX-LABEL: test_v16i8_nosignbit: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i8_nosignbit: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i8_nosignbit: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = and <16 x i8> %a, %2 = and <16 x i8> %b, %3 = icmp sgt <16 x i8> %1, %2 diff --git a/llvm/test/CodeGen/X86/combine-smin.ll b/llvm/test/CodeGen/X86/combine-smin.ll index b228b153f6eef..6a69bd06d85e0 100644 --- a/llvm/test/CodeGen/X86/combine-smin.ll +++ b/llvm/test/CodeGen/X86/combine-smin.ll @@ -2,10 +2,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2 define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: test_v16i8_nosignbit: @@ -32,13 +32,21 @@ define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) { ; SSE42-NEXT: pminsb %xmm2, %xmm0 ; SSE42-NEXT: retq ; -; AVX-LABEL: test_v16i8_nosignbit: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i8_nosignbit: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i8_nosignbit: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb 
{{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %1 = and <16 x i8> %a, %2 = and <16 x i8> %b, %3 = icmp slt <16 x i8> %1, %2 diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll index 70e0128629898..db37db7ec1be5 100644 --- a/llvm/test/CodeGen/X86/combine-sra.ll +++ b/llvm/test/CodeGen/X86/combine-sra.ll @@ -178,7 +178,8 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index d2704e8f7af3a..5c69fe9055971 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -214,7 +214,8 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) { ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_lshr1: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -422,7 +423,8 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) { ; ; AVX2-FAST-ALL-LABEL: combine_vec_lshr_trunc_and: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll index a542ab95c76b6..b898be5941ed1 100644 --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -373,7 +373,7 @@ define <4 x float> @mismatch_tofp_v4i32_v4f32(<2 x i32> %x, <2 x i32> %y) { ; AVX2-LABEL: mismatch_tofp_v4i32_v4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vsubpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll index edfcd94b43ae6..1baaab0931cb9 100644 --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -788,7 +788,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_unary(<32 ; CHECK: # %bb.0: ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = 
ymm0[0,1,0,1] ; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16] -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %r = shufflevector <32 x i8> %x, <32 x i8> poison, <32 x i32> @@ -800,7 +801,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_3_binary(<3 ; CHECK: # %bb.0: ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16] -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %r = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> @@ -857,7 +859,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_unary(<32 ; CHECK: # %bb.0: ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,2,3] ; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16] -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %r = shufflevector <32 x i8> %x, <32 x i8> poison, <32 x i32> @@ -869,7 +872,8 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_1_target_subvec_mask_3_binary(<3 ; CHECK: # %bb.0: ; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; CHECK-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16] -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] +; CHECK-NEXT: # ymm2 = mem[0,1,0,1] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq %r = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll index 2e25b5a0e1c03..906fead7f8db5 100644 --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ 
b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -53,7 +53,7 @@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpsllw $4, %xmm1, %xmm1 ; CHECK-NEXT: vpsrlw $4, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; CHECK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 ; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 @@ -107,7 +107,7 @@ entry: define i32 @mul_zext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-LABEL: mul_zext_i4i4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index 93d6a9f3fc9a5..d4dd264bfb5ef 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -288,7 +288,7 @@ define void @freeze_buildvector_single_maybe_poison_operand(ptr %origin, ptr %ds ; ; X64-LABEL: freeze_buildvector_single_maybe_poison_operand: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} xmm0 = +; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42] ; X64-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -322,7 +322,7 @@ define void @freeze_buildvector_single_repeated_maybe_poison_operand(ptr %origin ; ; X64-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} xmm0 = +; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42] ; X64-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 ; X64-NEXT: vpbroadcastq %xmm0, %xmm0 ; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll index 74eb3a56ef672..6fe3de87f9827 100644 --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -238,7 +238,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; GFNIAVX2-LABEL: splatconstant_fshl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $7, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -323,7 +323,7 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $2, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] ; GFNIAVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 diff 
--git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll index 7b79b02751164..ff9dc24e65a64 100644 --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -242,7 +242,7 @@ define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX2-LABEL: splatconstant_rotl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -328,7 +328,7 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX2-LABEL: splatconstant_rotr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] ; GFNIAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index d5ed003c45092..14efd6ab1f6c8 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -62,19 +62,28 @@ define <16 x i8> @splatconstant_ashr_v16i8(<16 x i8> %a) nounwind { ; GFNISSE-NEXT: psubb %xmm1, %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX1OR2-LABEL: splatconstant_ashr_v16i8: -; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; GFNIAVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: retq +; GFNIAVX1-LABEL: splatconstant_ashr_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; GFNIAVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: splatconstant_ashr_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; GFNIAVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_ashr_v16i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; GFNIAVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; GFNIAVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; GFNIAVX512-NEXT: retq @@ -193,7 +202,7 @@ define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; GFNIAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -201,7 +210,7 @@ define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512-LABEL: splatconstant_ashr_v32i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $2, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; GFNIAVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 ; GFNIAVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq @@ -247,7 +256,7 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX2-LABEL: splatconstant_shl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsllw $5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -296,7 +305,7 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX2-LABEL: splatconstant_lshr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsrlw $7, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -363,9 +372,9 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX2-LABEL: splatconstant_ashr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; GFNIAVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 @@ -377,7 +386,7 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX512-LABEL: splatconstant_ashr_v64i8: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsrlw $1, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; GFNIAVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; GFNIAVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; GFNIAVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; GFNIAVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll index 0c59e880dfd37..3af28d3b4c966 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -122,7 +122,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll index b64b0bf244139..5985dcae91842 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -124,7 +124,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll index 0b1137ff96643..f2f12654e6834 100644 --- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll @@ -217,7 +217,7 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -226,7 +226,7 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll index 364fd81eb1aa9..646ca1709c503 100644 --- 
a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -150,11 +150,23 @@ define <2 x i64> @elt0_v2i64(i64 %x) { ; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X86-AVX-NEXT: retl ; -; X64-AVX-LABEL: elt0_v2i64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = -; X64-AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: elt0_v2i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = +; X64-AVX1-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: elt0_v2i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; X64-AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512F-LABEL: elt0_v2i64: +; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; X64-AVX512F-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX512F-NEXT: retq %ins = insertelement <2 x i64> , i64 %x, i32 0 ret <2 x i64> %ins } diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll index d65bf782d7994..e3c5a5023ac9e 100644 --- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -425,7 +425,7 @@ define <2 x i64> @reassociate_umax_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> ; AVX2-LABEL: reassociate_umax_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5 ; AVX2-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4 @@ -723,7 +723,7 @@ define <2 x i64> @reassociate_umin_v2i64(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> ; AVX2-LABEL: reassociate_umin_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5 ; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 17548df343251..a6f4296ae5640 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1494,7 +1494,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -3915,7 +3915,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] diff --git 
a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index f605cd8271495..ffb3142df9fa9 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -2238,7 +2238,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -2451,10 +2451,10 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2588,34 +2588,63 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB7_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB7_3 -; AVX-NEXT: .LBB7_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB7_1: # %cond.store -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB7_4 -; AVX-NEXT: .LBB7_3: # %cond.store1 -; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB7_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne 
.LBB7_3 +; AVX1-NEXT: .LBB7_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB7_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB7_4 +; AVX1-NEXT: .LBB7_3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [32767,32767] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB7_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB7_3 +; AVX2-NEXT: .LBB7_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB7_1: # %cond.store +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB7_4 +; AVX2-NEXT: .LBB7_3: # %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: ; AVX512F: # %bb.0: @@ -2756,33 +2785,61 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB8_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB8_3 -; AVX-NEXT: .LBB8_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB8_1: # %cond.store -; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB8_4 -; AVX-NEXT: .LBB8_3: # %cond.store1 -; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB8_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB8_3 +; AVX1-NEXT: .LBB8_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB8_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB8_4 +; AVX1-NEXT: .LBB8_3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 
1(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB8_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB8_3 +; AVX2-NEXT: .LBB8_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB8_1: # %cond.store +; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB8_4 +; AVX2-NEXT: .LBB8_3: # %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 682e2002c075a..e288692d9eb89 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -1929,7 +1929,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -2125,7 +2125,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295] ; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2239,34 +2239,63 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535] -; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] -; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB7_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB7_3 -; AVX-NEXT: .LBB7_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB7_1: # %cond.store -; AVX-NEXT: vpextrw $0, %xmm0, 
(%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB7_4 -; AVX-NEXT: .LBB7_3: # %cond.store1 -; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB7_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB7_3 +; AVX1-NEXT: .LBB7_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB7_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB7_4 +; AVX1-NEXT: .LBB7_3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535] +; AVX2-NEXT: # xmm3 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB7_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB7_3 +; AVX2-NEXT: .LBB7_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB7_1: # %cond.store +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB7_4 +; AVX2-NEXT: .LBB7_3: # %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: ; AVX512F: # %bb.0: @@ -2387,33 +2416,61 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pextrb $1, %xmm3, 1(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [255,255] -; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] -; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB8_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB8_3 -; AVX-NEXT: .LBB8_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB8_1: # %cond.store -; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB8_4 -; AVX-NEXT: .LBB8_3: # %cond.store1 -; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i8: +; AVX1: # %bb.0: +; 
AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [255,255] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB8_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB8_3 +; AVX1-NEXT: .LBB8_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB8_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB8_4 +; AVX1-NEXT: .LBB8_3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [255,255] +; AVX2-NEXT: # xmm3 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB8_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB8_3 +; AVX2-NEXT: .LBB8_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB8_1: # %cond.store +; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB8_4 +; AVX2-NEXT: .LBB8_3: # %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -5867,7 +5924,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX2-LABEL: truncstore_v32i16_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpminuw %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpminuw %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 @@ -6107,7 +6164,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index 7e6cfc56574f5..c34ffc554f226 100644 --- 
a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -975,7 +975,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1011,7 +1011,7 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1115,27 +1115,49 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: vec128_i64_unsigned_reg_reg: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2 -; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 -; AVX-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: vec128_i64_unsigned_reg_reg: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_i64_unsigned_reg_reg: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $1, %xmm1, 
%xmm2 +; AVX2-NEXT: vpsrlq $33, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm3, %xmm4 +; AVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: vec128_i64_unsigned_reg_reg: ; XOP: # %bb.0: @@ -1162,7 +1184,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1198,7 +1220,7 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1352,7 +1374,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -1389,7 +1411,7 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -1542,7 +1564,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1579,7 +1601,7 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1735,7 +1757,7 @@ define <2 x i64> 
@vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1773,7 +1795,7 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1877,7 +1899,7 @@ define <8 x i16> @vec128_i16_signed_reg_reg(<8 x i16> %a1, <8 x i16> %a2) nounwi ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 @@ -2002,7 +2024,7 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 @@ -2110,7 +2132,7 @@ define <8 x i16> @vec128_i16_signed_mem_reg(ptr %a1_addr, <8 x i16> %a2) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 @@ -2218,7 +2240,7 @@ define <8 x i16> @vec128_i16_signed_reg_mem(<8 x i16> %a1, ptr %a2_addr) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 @@ -2331,7 +2353,7 @@ define <8 x i16> @vec128_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: 
vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 @@ -2565,7 +2587,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 @@ -2798,7 +2820,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 @@ -3040,7 +3062,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 @@ -3280,7 +3302,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 @@ -3529,7 +3551,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 960a55f01aec9..8d9ec21971f16 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ 
-1333,7 +1333,7 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 @@ -1461,7 +1461,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 @@ -1589,7 +1589,7 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0 @@ -1717,7 +1717,7 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 @@ -1850,7 +1850,7 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 @@ -1949,7 +1949,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; 
AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2009,7 +2009,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2037,7 +2037,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 @@ -2136,7 +2136,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2197,7 +2197,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2226,7 +2226,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 @@ -2324,7 +2324,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2386,7 +2386,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2415,7 +2415,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm2 ; AVX512BW-FALLBACK-NEXT: 
vpmaxsb %ymm0, %ymm1, %ymm0 @@ -2513,7 +2513,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2575,7 +2575,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2604,7 +2604,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 @@ -2704,7 +2704,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2768,7 +2768,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr 
%a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2798,7 +2798,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 57fdd3efcf231..8dffb2c855926 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -918,7 +918,7 @@ define dso_local void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -995,7 +995,7 @@ define dso_local void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; CHECK-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index 1e31d88e88f3f..38b77c6c09b7d 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -1929,7 +1929,7 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) { ; KNL-LABEL: allones_v2i64_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1] +; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] ; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $3, %al @@ -3075,7 +3075,7 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) { ; KNL-LABEL: allones_v2i64_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] ; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $3, %al diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index fa36c15b6445a..529e0ad24936a 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1994,7 +1994,8 @@ define void @splat3_128(<16 x i8> %a0, <16 x i8> %a1, ptr%a2) { ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 @@ -2165,7 +2166,8 @@ define void @splat3_256(<32 x i8> %a0, ptr%a1) { ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; 
AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll index 40d6ec6fb3155..06388362f91b7 100644 --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -486,7 +486,7 @@ define <64 x i8> @test14(<64 x i8> %x) { ; ; AVX2-LABEL: test14: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -526,7 +526,7 @@ define <64 x i8> @test15(<64 x i8> %x) { ; ; AVX2-LABEL: test15: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129] ; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -566,7 +566,7 @@ define <64 x i8> @test16(<64 x i8> %x) { ; ; AVX2-LABEL: test16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254] ; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -641,7 +641,7 @@ define <64 x i8> @test17(<64 x i8> %x) { ; ; AVX2-LABEL: test17: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1 @@ -694,7 +694,7 @@ define <64 x i8> @test18(<64 x i8> %x) { ; ; AVX2-LABEL: test18: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1238,7 +1238,7 @@ define <32 x i16> @test32(<32 x i16> %x) { ; ; AVX2-LABEL: test32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] ; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1278,7 +1278,7 @@ define <32 x i16> @test33(<32 x i16> %x) { ; ; AVX2-LABEL: test33: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769] ; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1318,7 +1318,7 @@ define <32 x i16> @test34(<32 x i16> %x) { ; ; AVX2-LABEL: test34: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534] ; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1449,7 +1449,7 @@ define <32 x i16> @test35(<32 x i16> %x) { ; ; AVX2-LABEL: test35: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1 @@ -1502,7 +1502,7 @@ define <32 x i16> @test36(<32 x i16> %x) { ; ; AVX2-LABEL: test36: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll index 2919a3019e75d..a553fbe73b227 100644 --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -348,9 +348,9 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) { ; AVX256: # %bb.0: ; AVX256-NEXT: vmovdqa (%rdi), %xmm0 ; AVX256-NEXT: vmovdqa (%rsi), %xmm1 -; AVX256-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX256-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 1b2dae5f2830a..8e6ae4b552657 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -122,7 +122,7 @@ define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind { ; ; AVX-LABEL: mul_v2i64c: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117] +; AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [117,117] ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -417,9 +417,9 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind { ; AVX2-LABEL: mul_v32i8c: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = 
[117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 @@ -430,9 +430,9 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind { ; AVX512F-LABEL: mul_v32i8c: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0 @@ -593,7 +593,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -607,7 +607,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -799,9 +799,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; AVX2-LABEL: mul_v64i8c: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0 @@ -820,9 +820,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1 @@ -841,9 +841,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; AVX512BW-LABEL: mul_v64i8c: ; AVX512BW: # %bb.0: # %entry ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 @@ -955,7 +955,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -980,7 +980,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -1004,7 +1004,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 1d7aa7f2586fe..f28f07e605fd8 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -372,7 +372,7 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) { ; AVX512F-LABEL: and_mulhuw_v16i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = 
[32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/pr37499.ll b/llvm/test/CodeGen/X86/pr37499.ll index 2995017275c18..15a7739fd2c7f 100644 --- a/llvm/test/CodeGen/X86/pr37499.ll +++ b/llvm/test/CodeGen/X86/pr37499.ll @@ -4,7 +4,7 @@ define <2 x i64> @undef_tval() { ; CHECK-LABEL: undef_tval: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1] ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpmovqw %zmm0, %xmm0 {%k1} @@ -18,7 +18,7 @@ define <2 x i64> @undef_tval() { define <2 x i64> @foo(<8 x i64> %x) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1} @@ -33,7 +33,7 @@ define <2 x i64> @foo(<8 x i64> %x) { define <4 x i64> @goo(<16 x i32> %x) { ; CHECK-LABEL: goo: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; CHECK-NEXT: movw $1, %ax ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpmovdw %zmm0, %ymm1 {%k1} diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll index 0ce83b190ead8..bbe46a99ffa41 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll @@ -89,7 +89,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) { define <32 x i8> @testv32i8(<32 x i8> %in) { ; AVX256-LABEL: testv32i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX256-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX256-NEXT: # ymm1 = mem[0,1,0,1] ; AVX256-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll index 06d4b6c1c197b..34e32c43ef797 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll @@ -36,7 +36,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) { define <16 x i8> @testv16i8(<16 x i8> %in) { ; AVX256-LABEL: testv16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX256-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX256-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX256-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -60,9 +60,10 @@ define <16 x i8> @testv16i8(<16 x i8> %in) { define <16 x i16> @testv16i16(<16 x i16> %in) { ; AVX256-LABEL: testv16i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX256-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX256-NEXT: # ymm3 = mem[0,1,0,1] ; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -84,17 +85,44 @@ define <16 x i16> @testv16i16(<16 x i16> %in) { } define <32 x i8> @testv32i8(<32 x i8> %in) { -; CHECK-LABEL: testv32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; CHECK-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX256-LABEL: testv32i8: +; AVX256: # %bb.0: +; AVX256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX256-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX256-NEXT: # ymm3 = mem[0,1,0,1] +; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX256-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX256-NEXT: retq +; +; AVX512VL-LABEL: testv32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512F-LABEL: testv32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) ret <32 x i8> %out } @@ -103,3 +131,5 @@ declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll index 91d4aa6c91dbb..f627560f9f382 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -9,7 +9,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) { ; AVX256BW: # %bb.0: ; AVX256BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX256BW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX256BW-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX256BW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -61,7 +61,7 @@ define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX256BW-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX256BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX256BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index faeaef7b40a62..b4b2adba2314f 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -2445,7 +2445,7 @@ define <64 x i8> @test27(<64 x i8> %x) { ; ; AVX2-LABEL: test27: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index cb89a6595ad3b..3207fe99b6850 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -530,14 +530,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX2-LABEL: v16i4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: 
vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -545,14 +545,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512F-LABEL: v16i4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -560,13 +560,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512BW-LABEL: v16i4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index 48a3155cea341..f41d105b6f4f4 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -659,7 +659,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) { ; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573] ; AVX2-NEXT: # xmm1 = mem[0,0] ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765] ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -726,7 +726,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) { ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -785,7 +785,7 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) { ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -1267,7 +1267,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i ; ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index 9b130f1f13841..483c16d6531b4 100644 --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -108,8 +108,8 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-AVX2-NEXT: movq 32(%rsi), %rdx ; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ; CHECK-AVX2-NEXT: xorl %esi, %esi -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1] -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2] +; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] +; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,2] ; CHECK-AVX2-NEXT: .p2align 4, 0x90 ; CHECK-AVX2-NEXT: .LBB0_1: # %vector.ph ; CHECK-AVX2-NEXT: # =>This Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll index 52dd300f4efeb..3b9c5a34bcccc 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s 
--check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL @@ -12,22 +12,33 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v16i8_1: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v16i8_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v16i8_1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -38,7 +49,7 @@ define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -135,16 +146,27 @@ define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v8i8_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq %xmm0, (%rsi) +; 
AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v8i8_1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512F: # %bb.0: @@ -184,16 +206,27 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v8i8_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v8i8_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2: ; AVX512F: # %bb.0: @@ -233,16 +266,27 @@ define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v8i8_3: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v8i8_3: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3: ; AVX512F: # %bb.0: @@ -489,16 +533,27 @@ define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind { } define void 
@shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v4i8_1: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v4i8_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v4i8_1: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vmovd %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512F: # %bb.0: @@ -538,16 +593,27 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v4i8_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v4i8_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v4i8_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vmovd %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512F: # %bb.0: @@ -587,16 +653,27 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v4i8_3: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; 
AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v4i8_3: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v4i8_3: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vmovd %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3: ; AVX512F: # %bb.0: @@ -636,16 +713,27 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v4i8_4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v4i8_4: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v4i8_4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vmovd %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4: ; AVX512F: # %bb.0: @@ -685,16 +773,27 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v4i8_5: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v4i8_5: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v4i8_5: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vmovd %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5: ; AVX512F: # %bb.0: @@ -734,16 +833,27 @@ define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v4i8_6: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v4i8_6: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v4i8_6: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vmovd %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6: ; AVX512F: # %bb.0: @@ -783,16 +893,27 @@ define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v4i8_7: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v4i8_7: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v4i8_7: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; 
AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vmovd %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index b042ce13bd627..07e1c56a11de4 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -18,18 +18,27 @@ ; Ideally, the shuffles should be lowered to code with the same quality as the truncates. define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand 16(%rdi), %xmm0, %xmm1 -; AVX-NEXT: vpand (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpand (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand 16(%rdi), %xmm0, %xmm1 +; AVX2-NEXT: vpand (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand 16(%rdi), %xmm0, %xmm1 ; AVX512F-NEXT: vpand (%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -38,7 +47,7 @@ define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand 16(%rdi), %xmm0, %xmm1 ; AVX512VL-NEXT: vpand (%rdi), %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -373,16 +382,27 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; 
AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8: ; AVX512F: # %bb.0: @@ -427,16 +447,27 @@ define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind { } define void @trunc_v8i32_to_v8i8(ptr %L, ptr %S) nounwind { -; AVX-LABEL: trunc_v8i32_to_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_v8i32_to_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v8i32_to_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-NEXT: vmovq %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_v8i32_to_v8i8: ; AVX512F: # %bb.0: @@ -498,7 +529,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind { ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -671,7 +702,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind { ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -800,7 +831,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no ; ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -871,7 +903,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no ; ; AVX2-FAST-ALL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = 
<0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -999,7 +1032,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { ; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1156,16 +1189,27 @@ define void @trunc_v4i64_to_v4i16(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vmovd %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8: ; AVX512F: # %bb.0: @@ -1210,16 +1254,27 @@ define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind { } define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind { -; AVX-LABEL: trunc_v4i64_to_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_v4i64_to_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_v4i64_to_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 
16(%rdi), %xmm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vmovd %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i8: ; AVX512F: # %bb.0: @@ -1281,7 +1336,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX2-LABEL: negative: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1291,7 +1347,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX512F-LABEL: negative: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1310,7 +1367,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; AVX512BW-LABEL: negative: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1330,7 +1388,8 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; ; AVX512VBMIVL-LABEL: negative: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = <32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMIVL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; AVX512VBMIVL-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 ; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512VBMIVL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 9a1d3ad7733a2..6e357a5fb34f5 100644 --- 
a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -382,7 +382,7 @@ define <4 x double> @PR34175(ptr %p) { ; ; AVX512BW-LABEL: PR34175: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40] ; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 @@ -392,7 +392,7 @@ define <4 x double> @PR34175(ptr %p) { ; ; AVX512BWVL-LABEL: PR34175: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u> +; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24] ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -401,7 +401,7 @@ define <4 x double> @PR34175(ptr %p) { ; ; AVX512VBMI-LABEL: PR34175: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,32,40,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,8,32,40,0,8,32,40,0,8,32,40,0,8,32,40] ; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %ymm2 ; AVX512VBMI-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 @@ -411,7 +411,7 @@ define <4 x double> @PR34175(ptr %p) { ; ; AVX512VBMIVL-LABEL: PR34175: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u> +; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24] ; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll index 99024f6bba218..8e330c3bfc676 100644 --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -347,7 +347,7 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) { ; ; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] ; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] @@ -407,7 +407,7 @@ define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) { ; ; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] ; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -490,7 +490,7 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) { ; ; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778] ; AVX2-SLOW-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 ; 
AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index f52132587c1df..fb4ad4a61426f 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2434,7 +2434,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 ; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[8],zero,ymm0[9],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[13],zero,zero,zero,ymm0[15],zero,zero,zero,ymm0[25],zero,zero,zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,zero,zero,ymm0[31],zero ; CHECK-AVX2-NEXT: vpackuswb %ymm6, %ymm4, %ymm4 @@ -2471,7 +2471,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; CHECK-AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index 72a3e74ff0a7f..531297af2a309 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -353,7 +353,7 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X64-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vpcmpgtw %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767] ; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0 ; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper @@ -622,7 +622,7 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; X64-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0 ; X64-AVX2-NEXT: vpblendvb %xmm1, 
%xmm2, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index 21f1fd6c8da21..056cb2c28c00d 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -530,14 +530,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX2-LABEL: v16i4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -545,14 +545,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512F-LABEL: v16i4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -560,13 +560,13 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; AVX512BW-LABEL: v16i4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 ; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll index 234259de2ad62..b99e13996d497 100644 --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -490,14 +490,32 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-NEXT: pminub %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: v16i4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: 
vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: v16i4: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i4: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: retq %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } @@ -906,7 +924,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll index e37760d1e0b09..2e4830723f9f2 100644 --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -369,14 +369,23 @@ define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; X86-LABEL: test_v2i64: ; X86: # %bb.0: diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll index f7878adbd3c95..30441fa4d1f9b 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll @@ -238,7 +238,7 @@ define <2 x i1> @t3_wide(<2 x i64> %X) nounwind { ; ; CHECK-AVX2-LABEL: t3_wide: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] +; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] ; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; 
CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll index a49f383e82631..705019bc61ac8 100644 --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -489,13 +489,29 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-NEXT: psubusb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: v16i4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: v16i4: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i4: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } @@ -817,7 +833,7 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1112,7 +1128,7 @@ define void @PR48223(ptr %p0) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdi) @@ -1124,7 +1140,7 @@ define void @PR48223(ptr %p0) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; AVX512F-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll index edba0caabc15f..020f8ea277a89 100644 --- a/llvm/test/CodeGen/X86/vec_anyext.ll +++ b/llvm/test/CodeGen/X86/vec_anyext.ll @@ -189,7 +189,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: vmovdqa (%rdi), %xmm0 ; X64-NEXT: vmovdqa 16(%rdi), %xmm1 -; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; X64-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll index cc130fe3427f2..1cff56efba91a 100644 --- a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll +++ b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll @@ -342,7 +342,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: ge_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 @@ -516,7 +516,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: gt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -764,7 +764,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: le_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -939,7 +939,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: lt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 24e05bd937b0c..7b1fd084dee06 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -52,14 +52,23 @@ define <2 x float> @uitofp_2i32_to_2f32(<2 x i32> %a) { ; SSE41-NEXT: cvtpd2ps %xmm0, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_2i32_to_2f32: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vcvtpd2ps %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_2i32_to_2f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i32_to_2f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i32_to_2f32: ; AVX512F: # %bb.0: @@ -667,13 +676,21 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) { ; SSE41-NEXT: subpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: 
uitofp_2i32_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_2i32_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i32_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i32_to_2f64: ; AVX512F: # %bb.0: @@ -3343,13 +3360,21 @@ define <2 x double> @uitofp_load_2i32_to_2f64(ptr%a) { ; SSE41-NEXT: subpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_load_2i32_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_load_2i32_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_2i32_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_2i32_to_2f64: ; AVX512F: # %bb.0: @@ -5663,10 +5688,10 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX2-NEXT: # xmm6 = mem[0,0] @@ -5691,10 +5716,10 @@ define void @PR43609(ptr nocapture %x, <2 x i64> %y) #0 { ; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX512F-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm5 = 
[4985484787499139072,4985484787499139072] ; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX512F-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] ; AVX512F-NEXT: # xmm6 = mem[0,0] diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll index 49adfbf5acfd0..76faaca5912e7 100644 --- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -71,7 +71,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: max_gt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -477,7 +477,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: max_ge_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -882,7 +882,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: min_lt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1290,7 +1290,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: min_le_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 19c32d786344c..eb8627e89887d 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1816,7 +1816,7 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm4 @@ -2546,7 +2546,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm3 ; AVX2-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 @@ -2666,7 +2666,7 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512BW-NEXT: vpmulhw %zmm1, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm3 ; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm4 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 32d2332fd3839..3ee92921b070e 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -855,16 +855,27 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: uaddo_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm0 -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovdqa %xmm1, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: uaddo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: uaddo_v2i64: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index e792fb9a8b271..3e8ee21cef40b 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1555,7 +1555,7 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX2-NEXT: vpmullw %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] @@ -2216,7 +2216,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31] ; AVX2-NEXT: vpmullw %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm7 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23] @@ -2322,7 +2322,7 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = 
zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63] ; AVX512BW-NEXT: vpmullw %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm5 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55] diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index 6f63236206e0c..49d169c896507 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -902,16 +902,27 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: usubo_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm0 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovdqa %xmm1, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: usubo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: 
vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: usubo_v2i64: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index 3d98cc95ad05c..ace5b3da1d3f5 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -447,18 +447,44 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v16i8: ; XOP: # %bb.0: @@ -524,19 +550,47 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: 
vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v8i16: ; XOP: # %bb.0: @@ -609,19 +663,47 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; 
AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v4i32: ; XOP: # %bb.0: @@ -696,19 +778,47 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, 
%xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v2i64: ; XOP: # %bb.0: @@ -822,26 +932,30 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { ; ; AVX2-LABEL: test_bitreverse_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test_bitreverse_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-NEXT: retq @@ -998,13 +1112,15 @@ define <16 x 
i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { ; AVX2-LABEL: test_bitreverse_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -1012,13 +1128,15 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { ; AVX512-LABEL: test_bitreverse_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-NEXT: retq @@ -1194,13 +1312,15 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { ; AVX2-LABEL: test_bitreverse_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -1208,13 +1328,15 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { ; AVX512-LABEL: test_bitreverse_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-NEXT: retq @@ -1394,13 +1516,15 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { ; AVX2-LABEL: test_bitreverse_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; 
AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -1408,13 +1532,15 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { ; AVX512-LABEL: test_bitreverse_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-NEXT: retq @@ -1635,13 +1761,15 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; ; AVX2-LABEL: test_bitreverse_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3 @@ -1655,16 +1783,18 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_bitreverse_v64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5 ; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1675,13 +1805,15 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; ; AVX512BW-LABEL: test_bitreverse_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -1931,15 +2063,18 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; ; AVX2-LABEL: test_bitreverse_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 @@ -1954,11 +2089,13 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_bitreverse_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 @@ -1966,7 +2103,8 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -1978,13 +2116,15 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; AVX512BW-LABEL: test_bitreverse_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -2047,7 +2187,8 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; ; GFNIAVX2-LABEL: test_bitreverse_v32i16: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 @@ -2058,7 +2199,8 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; GFNIAVX512F-LABEL: test_bitreverse_v32i16: ; GFNIAVX512F: # %bb.0: ; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -2279,15 +2421,18 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; ; AVX2-LABEL: test_bitreverse_v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; 
AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 @@ -2302,11 +2447,13 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; AVX512F-LABEL: test_bitreverse_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 @@ -2314,7 +2461,8 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -2326,13 +2474,15 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; AVX512BW-LABEL: test_bitreverse_v16i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -2395,7 +2545,8 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; ; GFNIAVX2-LABEL: test_bitreverse_v16i32: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 @@ -2406,7 +2557,8 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; GFNIAVX512F-LABEL: test_bitreverse_v16i32: ; GFNIAVX512F: # %bb.0: ; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -2635,15 +2787,18 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; ; AVX2-LABEL: test_bitreverse_v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 @@ -2658,11 +2813,13 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-LABEL: test_bitreverse_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 @@ -2670,7 +2827,8 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 @@ -2682,13 +2840,15 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512BW-LABEL: test_bitreverse_v8i64: ; 
AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: retq @@ -2751,7 +2911,8 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; ; GFNIAVX2-LABEL: test_bitreverse_v8i64: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 @@ -2762,7 +2923,8 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; GFNIAVX512F-LABEL: test_bitreverse_v8i64: ; GFNIAVX512F: # %bb.0: ; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; GFNIAVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll index 2271db9d64038..eaa3790e4357e 100644 --- 
a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -84,11 +84,17 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
 ; SSE41-NEXT: movdqa %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: vsel_4xi8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: vsel_4xi8:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_4xi8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
 entry:
   %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
   ret <4 x i8> %vsel
@@ -262,11 +268,17 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: vsel_i8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
-; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: vsel_i8:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vsel_i8:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
 entry:
   %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2
   ret <16 x i8> %vsel
@@ -627,7 +639,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
 ;
 ; AVX2-LABEL: constant_pblendvb_avx2:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 entry:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index caf4efbbf32c6..2feafb8950111 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -84,7 +84,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; AVX2-LABEL: var_funnnel_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4
 ; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
@@ -95,7 +95,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; AVX512F-LABEL: var_funnnel_v2i64:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4
 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1
 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
@@ -117,7 +117,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
 ;
 ; AVX512BW-LABEL: var_funnnel_v2i64:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -167,7 +167,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX2-LABEL: var_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 @@ -547,7 +547,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX512BW-LABEL: var_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 @@ -569,7 +569,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 @@ -583,18 +583,31 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX512VLVBMI2-NEXT: vpshldvw %xmm2, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: var_funnnel_v8i16: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpshlw %xmm4, %xmm0, %xmm0 -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 -; XOP-NEXT: vpshlw %xmm2, %xmm1, %xmm1 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: var_funnnel_v8i16: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_funnnel_v8i16: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpshlw %xmm4, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2 +; XOPAVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: var_funnnel_v8i16: ; X86-SSE2: # %bb.0: @@ -779,7 +792,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512F-LABEL: var_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -797,7 +810,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512VL-LABEL: var_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -871,19 +884,33 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; AVX512VLVBMI2-NEXT: vzeroupper ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: var_funnnel_v16i8: -; XOP: # %bb.0: -; XOP-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; XOP-NEXT: vpsubb %xmm4, %xmm5, %xmm4 -; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: var_funnnel_v16i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_funnnel_v16i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, 
%xmm4 +; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: var_funnnel_v16i8: ; X86-SSE2: # %bb.0: @@ -952,20 +979,31 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -987,7 +1025,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 @@ -1023,16 +1061,27 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VLVBMI2-NEXT: vpshldvq %xmm2, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: splatvar_funnnel_v2i64: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpsrlq $1, %xmm1, %xmm1 -; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatvar_funnnel_v2i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_funnnel_v2i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpsrlq $1, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: 
splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 0500d6ec6e1f9..13cbd9520adbe 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -395,7 +395,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX512BW-LABEL: var_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 @@ -415,7 +415,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 @@ -451,7 +451,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; XOPAVX2-LABEL: var_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 @@ -573,11 +573,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; ; AVX512F-LABEL: var_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 @@ -607,11 +607,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; ; AVX512VL-LABEL: var_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6 @@ -719,7 +719,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249] ; XOPAVX2-NEXT: vpaddb %xmm6, %xmm5, %xmm7 ; XOPAVX2-NEXT: vpshlb %xmm7, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 @@ -761,7 +761,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -772,7 +772,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -794,7 +794,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -849,7 +849,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index 3ffd137921d7d..8cb042c858964 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -184,7 +184,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512BW-LABEL: var_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 @@ -200,7 +200,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 @@ -222,19 +222,19 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm5 ; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm9 ; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm9 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm10, %ymm7 ; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm9 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 @@ -260,12 +260,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 @@ -291,19 +291,19 
@@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm5 ; AVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm7 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpxor %ymm3, %ymm8, %ymm9 ; AVX512VL-NEXT: vpsllw $5, %ymm9, %ymm9 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm7 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512VL-NEXT: vpand %ymm7, %ymm10, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 @@ -329,12 +329,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 @@ -426,7 +426,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn 
%xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -448,7 +448,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 74b7fa84aac12..08402ab0391dd 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -87,7 +87,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsllvq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -414,7 +414,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -425,7 +425,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -737,17 +737,29 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpsllq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpsllq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: retq ; ; 
AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index fc0804b7c92e2..e2fe10bfecd2b 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -318,7 +318,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -328,7 +328,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -587,7 +587,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -1316,11 +1316,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; ; AVX512BW-LABEL: constant_funnnel_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1340,11 +1342,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; ; AVX512VBMI2-LABEL: constant_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; 
AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index dd9689676edb9..81874d461fcde 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -37,7 +37,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] @@ -66,7 +66,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] @@ -96,7 +96,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -106,7 +106,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -143,7 +143,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> 
%x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8 ; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 @@ -186,7 +186,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 537096e48b066..6fe03f54123c3 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -324,7 +324,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 924de00641efb..5e6d79b059d20 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -84,7 +84,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -95,7 +95,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512F-LABEL: var_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] 
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -117,7 +117,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512BW-LABEL: var_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -168,7 +168,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; XOPAVX2-LABEL: var_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -604,7 +604,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX512BW-LABEL: var_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -626,7 +626,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -641,18 +641,31 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: var_funnnel_v8i16: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; XOP-NEXT: vpsubw %xmm4, %xmm5, %xmm4 -; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 -; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: var_funnnel_v8i16: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_funnnel_v8i16: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpsubw %xmm4, %xmm5, %xmm4 +; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: 
var_funnnel_v8i16: ; X86-SSE2: # %bb.0: @@ -825,41 +838,73 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_funnnel_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpsllw $5, %xmm4, %xmm4 -; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5 -; AVX-NEXT: vpsrlw $4, %xmm1, %xmm6 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 -; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $2, %xmm1, %xmm4 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm4 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5 -; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllw $5, %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm4 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: var_funnnel_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 +; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_funnnel_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsllw $5, %xmm4, %xmm4 +; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 +; AVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm4 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: 
vpsrlw $1, %xmm1, %xmm4 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw $5, %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm4 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: var_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero @@ -876,7 +921,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512VL-LABEL: var_funnnel_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero @@ -945,18 +990,31 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; AVX512VLVBMI2-NEXT: vzeroupper ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: var_funnnel_v16i8: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; XOP-NEXT: vpsubb %xmm4, %xmm5, %xmm4 -; XOP-NEXT: vpshlb %xmm4, 
%xmm1, %xmm1 -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: var_funnnel_v16i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_funnnel_v16i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4 +; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: var_funnnel_v16i8: ; X86-SSE2: # %bb.0: @@ -1041,20 +1099,31 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1076,7 +1145,7 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1113,16 +1182,27 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: splatvar_funnnel_v2i64: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpsrlq %xmm4, 
%xmm1, %xmm1 -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatvar_funnnel_v2i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_funnnel_v2i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: @@ -1376,25 +1456,38 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: 
; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 @@ -1407,7 +1500,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 @@ -1420,7 +1513,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 4a580c8bacabe..624e3e8471d20 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -423,7 +423,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; AVX512BW-LABEL: var_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -443,7 +443,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VLBW-NEXT: 
vpsrlvw %ymm4, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -483,7 +483,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; XOPAVX2-LABEL: var_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 @@ -576,7 +576,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; ; AVX2-LABEL: var_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5 @@ -608,7 +608,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; ; AVX512F-LABEL: var_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5 @@ -640,7 +640,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; ; AVX512VL-LABEL: var_funnnel_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5 @@ -752,7 +752,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 ; XOPAVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5 @@ -792,7 +792,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -803,7 +803,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -825,7 +825,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; 
AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -881,7 +881,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1163,7 +1163,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1176,7 +1176,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1189,7 +1189,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd 
{{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1202,7 +1202,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1227,7 +1227,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1654,7 +1654,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1676,7 +1676,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1698,7 +1698,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 9630cc7876f6e..40f15af5887be 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -184,7 +184,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512BW-LABEL: var_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 @@ -201,7 +201,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpandnq %zmm3, %zmm2, %zmm2 @@ -225,16 +225,16 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 ; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpxor %ymm7, %ymm3, %ymm8 ; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 @@ -257,17 +257,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 @@ -292,16 +292,16 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 ; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpxor %ymm7, %ymm3, %ymm8 ; AVX512VL-NEXT: vpsllw $5, %ymm8, %ymm8 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm6 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 @@ -324,17 +324,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 @@ -361,7 +361,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = 
zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm5, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55] @@ -391,7 +391,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63] ; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpandq %zmm5, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55] @@ -424,7 +424,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -446,7 +446,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -628,7 +628,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] ; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 @@ -651,7 +651,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = 
ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] ; AVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 @@ -672,7 +672,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512BW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0 @@ -696,7 +696,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm3, %zmm3 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLBW-NEXT: vpsrlw %xmm2, %zmm0, %zmm0 @@ -911,7 +911,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1] @@ -960,7 +960,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64] ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512VL-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1] @@ -984,7 +984,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 @@ -1006,7 +1006,7 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] ; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] ; AVX512VLBW-NEXT: 
vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 04e4e66dd1b95..0dab0a466b179 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -87,7 +87,7 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlvq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -432,7 +432,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -443,7 +443,7 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -635,7 +635,7 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] ; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero @@ -764,17 +764,29 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: 
vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: @@ -1100,25 +1112,38 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 @@ -1131,7 +1156,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 @@ -1144,7 
+1169,7 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index a62b2b70abdb1..755c098309088 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -334,7 +334,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -344,7 +344,7 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -514,7 +514,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31] ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] @@ -530,7 +530,7 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLBW-NEXT: vpsrlvw %ymm3, %ymm4, %ymm3 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -618,7 +618,7 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -934,7 +934,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 @@ -947,7 +947,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 @@ -960,7 +960,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 @@ -973,7 +973,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 @@ -986,7 +986,7 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 @@ -1367,11 +1367,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; ; AVX512BW-LABEL: constant_funnnel_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1391,11 +1393,13 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; ; AVX512VBMI2-LABEL: constant_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1] 
; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index a1abdd9565af4..1f0cf8b7affa9 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -37,7 +37,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] @@ -66,7 +66,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] @@ -96,7 +96,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -106,7 +106,7 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; 
AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -213,7 +213,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] @@ -229,7 +229,7 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLBW-NEXT: vpsrlvw %zmm3, %zmm4, %zmm3 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm4 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm3, %zmm3 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] @@ -375,7 +375,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 @@ -397,7 +397,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 @@ -418,7 +418,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 @@ -431,7 +431,7 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll index 54acb196f275d..72a1422d2b9e0 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -366,7 +366,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll index 53e450856632d..511203ced00a8 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -222,7 +222,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = 
[32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -240,7 +240,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -619,7 +619,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -641,7 +641,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll index 279f3c464411c..549c6814d3028 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -201,7 +201,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] @@ -211,7 +211,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; 
AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -228,7 +228,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -595,7 +595,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] @@ -605,7 +605,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -626,7 +626,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; 
AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -739,7 +739,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2NOBW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 04761b6d35c5f..ac974bb51f54f 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -103,7 +103,7 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_div7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] ; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm3 ; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1 @@ -132,7 +132,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] @@ -141,12 +141,12 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = 
[63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512F-NEXT: vpxor %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1 @@ -172,7 +172,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] @@ -181,7 +181,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 @@ -221,7 +221,7 @@ define <64 x i8> 
@test_divconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] @@ -411,7 +411,7 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] ; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4 ; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3 @@ -448,7 +448,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] @@ -457,17 +457,17 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm3 ; AVX512F-NEXT: vpaddb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsubb %ymm8, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5 ; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 @@ -497,7 +497,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512BW-NEXT: vpmulhw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] @@ -506,7 +506,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 @@ -550,12 +550,12 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm7, %ymm7 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 @@ -619,7 +619,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll index 805dd422ac491..b38de74f85eca 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -202,7 +202,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -624,7 +624,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -777,7 +777,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2NOBW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll index 5169dd69f39fc..c64344396990a 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -103,7 +103,7 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind { define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_div7_32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 @@ -135,7 +135,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -144,7 +144,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 @@ -169,7 +169,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] @@ -431,7 +431,7 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, 
%ymm4 @@ -471,7 +471,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] @@ -480,14 +480,14 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm5 ; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7 ; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 @@ -515,7 +515,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = 
[37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] @@ -570,7 +570,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 @@ -641,7 +641,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 ; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm1[0],zmm2[1],zmm1[1],zmm2[2],zmm1[2],zmm2[3],zmm1[3],zmm2[4],zmm1[4],zmm2[5],zmm1[5],zmm2[6],zmm1[6],zmm2[7],zmm1[7],zmm2[16],zmm1[16],zmm2[17],zmm1[17],zmm2[18],zmm1[18],zmm2[19],zmm1[19],zmm2[20],zmm1[20],zmm2[21],zmm1[21],zmm2[22],zmm1[22],zmm2[23],zmm1[23],zmm2[32],zmm1[32],zmm2[33],zmm1[33],zmm2[34],zmm1[34],zmm2[35],zmm1[35],zmm2[36],zmm1[36],zmm2[37],zmm1[37],zmm2[38],zmm1[38],zmm2[39],zmm1[39],zmm2[48],zmm1[48],zmm2[49],zmm1[49],zmm2[50],zmm1[50],zmm2[51],zmm1[51],zmm2[52],zmm1[52],zmm2[53],zmm1[53],zmm2[54],zmm1[54],zmm2[55],zmm1[55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 67370c65bc603..6dda7005c8fc0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -264,11 +264,11 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-SLOW-LABEL: load_i16_stride3_vf4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa 
{{.*#+}} xmm0 = <0,3,6,9,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9] ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,4,7,10,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10] ; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 ; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] @@ -281,13 +281,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FAST-LABEL: load_i16_stride3_vf4: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,3,6,9,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9] ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,4,7,10,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10] ; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,5,8,11,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,5,8,11,2,5,8,11] ; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm4 ; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 2c8d6573cdd14..722065453963e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -856,7 +856,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> @@ -876,7 +876,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7 @@ -1086,7 +1086,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512F-FAST-LABEL: load_i16_stride4_vf16: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm0 = 
mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 @@ -1100,7 +1101,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm4 ; AVX512F-FAST-NEXT: vpmovqw %zmm4, %xmm9 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 @@ -1903,7 +1904,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm12 @@ -1939,7 +1940,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm10 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> @@ -1966,7 +1967,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,3,1,3,5,7] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] @@ -2000,7 +2001,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = 
<2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 @@ -2398,7 +2399,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm5 @@ -2422,7 +2424,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm13 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 @@ -4116,7 +4118,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm4 @@ -4194,7 +4196,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm13 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm9 @@ -4262,7 +4264,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm5 @@ -4356,7 +4358,7 @@ define void 
@load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> @@ -5221,7 +5223,8 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %zmm26 ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 ; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm24 ; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm10 @@ -5265,7 +5268,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpmovqw %zmm23, %xmm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm14 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 6b0d0a9e7662f..888dbf5da722f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -448,22 +448,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i16_stride5_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,11,u,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,5,10,u,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0] ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,7,12,17,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17] ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,8,13,18,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18] ; AVX512BW-NEXT: vpermi2w %ymm3, 
%ymm4, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <4,9,14,19,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19] ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 ; AVX512BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) @@ -1365,9 +1365,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -1380,9 +1381,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] @@ -1425,7 +1427,8 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,5,7,0,2,5,7] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -1627,10 +1630,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,5,0,0,3,5,0] +; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 ; AVX512F-FAST-NEXT: 
vpermd %ymm4, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] @@ -1643,9 +1647,10 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,3,6,0,1,3,6,0] +; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] @@ -1688,7 +1693,8 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,5,7,0,2,5,7] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] @@ -2731,7 +2737,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 @@ -2846,7 +2853,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm11, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm8[4,5,6,7] @@ -2859,7 +2866,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,2,3,1,3,6,7] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] @@ -2923,7 +2930,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] @@ -2949,14 +2956,15 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload @@ -3168,7 +3176,8 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 @@ -5704,7 +5713,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 @@ -5938,7 +5948,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,1,3,0,3,5,7] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7],ymm8[8,9,10,11,12],ymm3[13,14,15] @@ -5968,7 +5978,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,3,2,3,1,3,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] @@ -6077,7 +6087,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm6[0,1,2,3,4],ymm10[5,6,7],ymm6[8,9,10,11,12],ymm10[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] @@ -6143,12 +6153,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,u,u,6,0,3,5> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,2,1,3,0,2,5,7] ; AVX2-FAST-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15] @@ -6604,7 +6615,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm1 @@ -7097,7 +7109,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] ; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] @@ -7295,7 +7307,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vporq %ymm3, %ymm0, %ymm19 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm13, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm17 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,3,12,13,2,3,12,13,2,3,12,13,2,3,12,13] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm11 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm11, %xmm9 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] @@ -7366,7 +7378,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm22 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm11, %xmm30 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm17[2],xmm1[3],xmm17[3] @@ -7445,7 +7457,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm7 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0],xmm7[1],xmm13[2,3] 
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <1,3,6,0,5,u,u,u> ; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm9 @@ -7477,7 +7489,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm7[2],xmm13[3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm9[1,2],ymm14[3],ymm9[4],ymm14[5],ymm9[6,7],ymm14[8],ymm9[9,10],ymm14[11],ymm9[12],ymm14[13],ymm9[14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u> ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 59ff5b4f8f45b..e7ab8a895b2c8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -240,7 +240,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw 4(%rdi), %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,9,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,9,3,9,3,9,3,9] ; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm5 ; AVX512BW-FAST-NEXT: vpbroadcastw 20(%rdi), %xmm6 ; AVX512BW-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm7 @@ -528,19 +528,19 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i16_stride6_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,6,12,18,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,12,18,0,6,12,18] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,7,13,19,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,7,13,19,1,7,13,19] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,8,14,20,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,8,14,20,2,8,14,20] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,9,15,21,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,9,15,21,3,9,15,21] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <4,10,16,22,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,10,16,22,4,10,16,22] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <5,11,17,23,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,11,17,23,5,11,17,23] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) @@ -3362,7 +3362,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: 
vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3 @@ -3480,7 +3480,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5],xmm7[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 @@ -3498,7 +3498,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3] @@ -3580,7 +3580,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2 @@ -3600,7 +3600,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] @@ -3611,7 +3611,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> ; AVX2-FAST-NEXT: 
vextracti128 $1, %ymm2, %xmm5 @@ -3648,7 +3648,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = @@ -3692,7 +3692,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm13 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] @@ -3732,7 +3732,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7] @@ -3758,7 +3758,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm8 @@ -3777,7 +3777,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 @@ -3858,7 +3858,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm13, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm2, %xmm2 @@ -3878,7 +3878,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm10, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm8[1],ymm5[2,3,4,5],ymm8[6],ymm5[7] @@ -3889,7 +3889,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 @@ -3926,7 +3926,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3],xmm10[4,5],xmm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = @@ -3970,7 +3970,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0],xmm13[1,2],xmm15[3],xmm13[4,5,6,7] @@ -4010,7 +4010,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5,6],xmm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = 
xmm9[0],xmm10[1,2],xmm9[3],xmm10[4,5,6,7] @@ -4036,7 +4036,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm8 @@ -4055,7 +4055,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm1 @@ -4159,7 +4159,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm10, %ymm30 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm4 @@ -4294,7 +4294,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1] ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -4334,7 +4334,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,3] @@ -4387,7 +4387,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 ; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm3 @@ -4396,7 +4396,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm15 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> @@ -4446,7 +4446,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm9[2],xmm4[3],xmm9[4,5],xmm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> @@ -4573,7 +4573,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm9 @@ -4608,7 +4608,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm9 @@ -4618,7 +4618,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 
{{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 @@ -4725,7 +4725,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512DQ-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 @@ -4856,7 +4856,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm4 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4896,7 +4896,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3],xmm5[4],xmm0[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3] @@ -4943,7 +4943,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm3[2,1,0,3] ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm3 @@ -4952,7 +4952,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm7 ; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm4 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm10 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> @@ -5005,7 +5005,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = 
<2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm14[2],xmm5[3],xmm14[4,5],xmm5[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> @@ -5129,7 +5129,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 @@ -5164,7 +5164,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm10, %xmm10 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 @@ -5174,7 +5174,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm8 ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 @@ -7725,7 +7725,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] @@ -7968,7 +7968,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm8 ; AVX2-SLOW-NEXT: vpblendw 
{{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -8010,7 +8010,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3] @@ -8166,7 +8166,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm11 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm12, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm2 @@ -8203,7 +8203,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] @@ -8220,7 +8220,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -8319,7 +8319,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm15 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] @@ -8401,7 +8401,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} 
xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm10 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7] @@ -8511,7 +8511,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7] @@ -8571,7 +8571,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm9 @@ -8612,7 +8612,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 @@ -8766,7 +8766,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm12, %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm9, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm2 @@ -8803,7 +8803,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7] @@ -8820,7 +8820,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -8919,7 +8919,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm0, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] @@ -9001,7 +9001,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm7, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,2,0,4,5,6,7] @@ -9111,7 +9111,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm9 = mem[0,1,2,3,7,5,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5,6],xmm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1,2],xmm4[3],xmm7[4,5,6,7] @@ -9171,7 +9171,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm9 @@ -9212,7 +9212,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 @@ -9425,7 +9425,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm15, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm14, %xmm14 @@ -9709,7 +9709,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm5, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7] @@ -9809,7 +9809,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1],xmm2[2,3],xmm10[4],xmm2[5,6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm22, %xmm1 @@ -9919,7 +9919,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] @@ -9931,7 +9931,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 ; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> @@ -10053,7 +10053,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm13, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> @@ -10140,7 +10140,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm20 @@ -10254,7 +10254,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm11, %xmm20 @@ -10330,7 +10330,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> @@ -10426,7 +10426,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm11 @@ -10440,7 +10440,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, 
%xmm10, %xmm10 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> @@ -10677,7 +10677,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 @@ -10954,7 +10954,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm5, %xmm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7] @@ -11053,7 +11053,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm19, %xmm13 @@ -11145,7 +11145,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3] @@ -11157,7 +11157,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm2, %ymm23 @@ -11283,7 +11283,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> ; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm12, %xmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm7 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> @@ -11371,7 +11371,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 @@ -11485,7 +11485,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm14, %zmm20 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm15 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm11, %xmm22 @@ -11552,7 +11552,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> @@ -11647,7 +11647,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm9 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> ; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm11, %xmm11 @@ -11660,7 +11660,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm11 ; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm10 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512DQ-FAST-NEXT: 
vpbroadcastq {{.*#+}} xmm10 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index e5674bc467e0d..56f269c2441c5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -270,7 +270,7 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpsrlq $48, %xmm1, %xmm8 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512BW-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <6,13,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [6,13,6,13,6,13,6,13] ; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm8 ; AVX512BW-FAST-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-FAST-NEXT: vmovd %xmm4, (%rdx) @@ -688,21 +688,21 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,14,21,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,7,14,21,0,7,14,21] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,22,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,8,15,22,1,8,15,22] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,9,16,23,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,9,16,23,2,9,16,23] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,10,17,24,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,10,17,24,3,10,17,24] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <4,11,18,25,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,11,18,25,4,11,18,25] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <5,12,19,26,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,12,19,26,5,12,19,26] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,13,20,27,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,13,20,27,6,13,20,27] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) @@ -1389,7 +1389,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] @@ -2352,7 
+2352,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5] @@ -2453,7 +2453,8 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,5,1,4,2,5,1,4] ; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm15, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [0,3,7,0,0,3,7,0] +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm15, %ymm15 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] @@ -2521,7 +2522,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] @@ -2839,153 +2840,307 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; -; AVX512F-FAST-LABEL: load_i16_stride7_vf16: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] -; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; 
AVX512F-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] -; AVX512F-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] -; 
AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] -; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX512F-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa %ymm10, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm7, (%r8) -; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%r9) -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa %ymm9, (%rax) -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rax) -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: load_i16_stride7_vf16: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 
{{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = 
ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512F-ONLY-FAST-NEXT: 
vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm7, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm8, (%r9) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, (%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-FAST-LABEL: load_i16_stride7_vf16: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [10,3,6,15,12,13,6,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm2[0,1,0,2] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm14[4],xmm12[5],xmm14[6],xmm12[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm12, %ymm3 +; AVX512DQ-FAST-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3],xmm15[4],xmm14[5],xmm15[6],xmm14[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm14, %ymm6 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3,4,5],xmm13[6],xmm14[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm14 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm10, %ymm14, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0],ymm13[1,2,3,4,5,6,7],ymm10[8],ymm13[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5],xmm14[6],xmm13[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; 
AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2],xmm15[3],xmm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,3,7,10,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1,2,3,4,5,6,7],ymm7[8],ymm13[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm12 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm14, %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm9[1,2,3,4,5,6,7],ymm8[8],ymm9[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,3,u,0,3,7,u> +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm15[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6,7],ymm11[8,9,10,11,12],ymm9[13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] 
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm1, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,4,7,0,0,4,7,0] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,2,3,4,5,10,11,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,2,3,4,5,10,11,12,13,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm10, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm7, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa %ymm8, (%r9) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, (%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: load_i16_stride7_vf16: ; AVX512BW: # %bb.0: @@ -2995,42 +3150,49 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa 
{{.*#+}} ymm6 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] +; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 @@ -5026,7 +5188,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm14 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload @@ -5054,7 +5216,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = 
[0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -5106,7 +5268,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] @@ -5180,7 +5342,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,5,1,4,2,5,1,4] @@ -5231,12 +5393,12 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -5426,7 +5588,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm7[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; 
AVX2-FAST-PERLANE-NEXT: vpblendd $31, (%rsp), %ymm14, %ymm0 # 32-byte Folded Reload @@ -5453,7 +5615,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -5478,7 +5640,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm7[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm12 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -5503,7 +5665,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm7[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm3, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] @@ -5544,7 +5706,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] @@ -6090,11 +6252,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm29 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm25 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
{{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [10,3,6,15,12,13,6,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <1,u,u,u,5,8,12,15> ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> @@ -6139,7 +6304,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2],xmm13[3,4,5,6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm4, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 @@ -6292,7 +6457,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm24 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm1, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25 @@ -6757,13 +6923,16 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm30 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm31 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm31 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm24 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,3,6,15,12,13,6,15] ; AVX512DQ-FAST-NEXT: vpermd %zmm30, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm20 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = <1,u,u,u,5,8,12,15> ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> @@ -6808,7 +6977,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm14[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3,4,5,6],xmm14[7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm13 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm25 @@ -11195,7 +11364,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -11249,7 +11418,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -11364,7 +11533,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -11416,7 +11585,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -11539,13 +11708,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpermd 
%ymm13, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] @@ -11617,12 +11786,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,4,7,0,0,4,7,0] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,6,1,5,2,6,1,5] ; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -12003,7 +12172,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,1,0,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm14 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -12067,7 +12236,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -12119,7 +12288,7 @@ define void 
@load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm6[0,1,1,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm4 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -12178,7 +12347,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -12245,7 +12414,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] @@ -13048,7 +13217,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm2, %ymm14 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6],ymm8[7,8],ymm1[9,10,11,12,13,14],ymm8[15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 @@ -13581,7 +13750,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7] @@ -13703,11 +13872,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,1,1,3] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -13738,7 +13907,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6 @@ -13749,7 +13918,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [3,6,10,13,3,6,10,13] +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm19, %zmm25, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm11, %ymm6, %ymm6 @@ -13775,7 +13945,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,9,12,2,5,9,12] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm8, %zmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm7 @@ -13792,7 +13963,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3],xmm7[4],xmm3[5],xmm7[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm29 @@ -13825,7 +13996,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,6,9,13,2,6,9,13] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm1, %zmm11, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 @@ -13839,7 +14011,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm3, %zmm8, %zmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm4[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13864,7 +14036,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6],xmm8[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,u,u,u,4,8,11,15> ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm16, %zmm15, %zmm0 @@ -13873,7 +14045,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2],xmm7[3,4,5,6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm11, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm7, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14655,7 +14827,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendd 
{{.*#+}} ymm12 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm7, %ymm12, %ymm12 @@ -15182,7 +15354,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29] ; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm7 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm12[1],xmm0[2,3,4,5,6,7] @@ -15305,11 +15477,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm12 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25] ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm23[0,1,1,3] ; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm14 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm12[0,1,2,3,4,5,6],ymm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11] ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm14, %zmm21 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] @@ -15338,7 +15510,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27] ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQ-FAST-NEXT: vpbroadcastw 232(%rdi), %xmm6 @@ -15349,7 +15521,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 ; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm19 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] 
; AVX512DQ-FAST-NEXT: vpermd %zmm19, %zmm1, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u> @@ -15376,7 +15549,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,5,9,12,2,5,9,12] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm5 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31] ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 @@ -15393,7 +15567,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3],xmm5[4],xmm1[5],xmm5[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11] ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,u,u,u,4,7,11,14> @@ -15423,7 +15597,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermd %ymm23, %ymm12, %ymm2 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13] +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm10, %zmm0 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] @@ -15434,7 +15609,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm0 ; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm6, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -15458,7 +15633,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm5 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3],xmm6[4],xmm14[5],xmm6[6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13] ; AVX512DQ-FAST-NEXT: 
vpshufb %xmm14, %xmm6, %xmm6 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,u,u,4,8,11,15> ; AVX512DQ-FAST-NEXT: vpermd %zmm17, %zmm7, %zmm15 @@ -15467,7 +15642,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4,5,6],xmm15[7] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm10, %zmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29] ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm15 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm15[6,7] ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -15496,7 +15671,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128] ; AVX512DQ-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm6 ; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm1, %ymm1 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,10,13,3,6,10,13] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermd %zmm16, %zmm5, %zmm6 ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index 9c533a00de734..7032733e67c49 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -271,23 +271,23 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,17,25,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,9,17,25,1,9,17,25] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,10,18,26,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,10,18,26,2,10,18,26] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,19,27,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,11,19,27,3,11,19,27] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <4,12,20,28,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,12,20,28,4,12,20,28] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <5,13,21,29,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,13,21,29,5,13,21,29] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,14,22,30,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,14,22,30,6,14,22,30] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = <7,15,23,31,u,u,u,u> +; 
AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [7,15,23,31,7,15,23,31] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) @@ -541,7 +541,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] @@ -560,7 +560,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <3,7,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] ; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -602,7 +602,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm2, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm8 @@ -614,7 +614,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <1,5,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [1,5,1,5] ; AVX512F-FAST-NEXT: vmovdqa %xmm13, %xmm15 ; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm14, %xmm15 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] @@ -622,11 +622,11 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 ; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm15 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm15 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <3,7,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] ; AVX512F-FAST-NEXT: vpermt2d %xmm12, %xmm15, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm13[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] @@ -1285,7 
+1285,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm8[3] @@ -1350,7 +1350,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <3,7,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] ; AVX512F-SLOW-NEXT: vpermt2d %xmm14, %xmm17, %xmm15 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -1453,7 +1453,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,4,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm7, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 @@ -1487,7 +1487,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <1,5,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm13 = [1,5,1,5] ; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm13, %xmm1 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm14[0],xmm11[1],xmm14[1] @@ -1503,7 +1503,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 ; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [2,6,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm11, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm12[2],xmm4[2],xmm12[3],xmm4[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm0[2,3] @@ -1520,7 +1520,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = <3,7,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm19 = [3,7,3,7] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm19, %xmm12 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = 
ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -1621,42 +1621,50 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 @@ -3071,7 +3079,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] @@ -3231,7 +3239,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <3,7,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [3,7,3,7] ; AVX512F-SLOW-NEXT: vpermt2d %xmm9, %xmm8, %xmm5 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: # xmm1 = xmm5[0,1],mem[2,3] @@ -3269,7 +3277,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpermt2d %xmm0, %xmm2, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload @@ -3397,7 +3405,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <3,7,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7] ; AVX512F-SLOW-NEXT: vpermt2d %xmm18, %xmm12, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] @@ -3454,7 +3462,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 ; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm1, %xmm0 @@ -3549,7 +3557,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: 
vmovdqa %xmm1, %xmm13 @@ -3581,7 +3589,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm18[2],xmm5[2],xmm18[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm20[2],xmm21[3],xmm20[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] @@ -3617,7 +3625,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = <3,7,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm29 = [3,7,3,7] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm29, %xmm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] @@ -3655,7 +3663,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm6, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,4,0,4] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm20 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -3719,7 +3727,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm28 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <1,5,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [1,5,1,5] ; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm8, %xmm0 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm20[0],xmm6[1],xmm20[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -3747,7 +3755,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm20[2],xmm6[3],xmm20[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm4 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm30[2],xmm3[3],xmm30[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] @@ -3844,7 +3852,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] @@ -3856,7 +3865,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm10 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] @@ -3868,7 +3878,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -3880,7 +3891,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] @@ -3892,7 +3904,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm13 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] @@ -3904,7 +3917,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm14 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] @@ -3916,7 +3930,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm15 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vpblendd 
{{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] @@ -3926,7 +3941,8 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm15, %zmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -6894,7 +6910,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2] @@ -7288,7 +7304,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <3,7,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm0 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -7363,7 +7379,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,4,0,4] ; AVX512F-SLOW-NEXT: vpermt2d %xmm5, %xmm4, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload @@ -7691,7 +7707,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <3,7,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm1 ; AVX512F-SLOW-NEXT: vpermt2d %xmm20, %xmm16, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -7799,7 +7815,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa 
{{.*#+}} xmm1 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,4,0,4] ; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 ; AVX512F-FAST-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 @@ -8034,7 +8050,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm8 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <1,5,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [1,5,1,5] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpermt2d %xmm30, %xmm11, %xmm8 @@ -8105,7 +8121,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3] ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, %xmm4 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm22[2],xmm30[2],xmm22[3],xmm30[3] @@ -8208,7 +8224,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = <3,7,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm18, %xmm0 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -8281,7 +8297,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,4,0,4] ; AVX512F-FAST-NEXT: vpermt2d %xmm4, %xmm5, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm16 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] @@ -8464,7 +8480,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <1,5,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] ; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm15, %xmm0 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm16[0],xmm9[1],xmm16[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -8530,7 +8546,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm16[2],xmm9[3],xmm16[3] ; 
AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,6,2,6] ; AVX512F-FAST-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm27 ; AVX512F-FAST-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload @@ -8635,7 +8651,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} xmm24 = <3,7,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm24 = [3,7,3,7] ; AVX512F-FAST-NEXT: vpermt2d %xmm27, %xmm24, %xmm1 ; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm1 = xmm1[0,1],mem[2,3] @@ -8729,282 +8745,1149 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; -; AVX512BW-LABEL: load_i16_stride8_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 -; AVX512BW-NEXT: 
vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, 
%zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 -; AVX512BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 -; AVX512BW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = -; AVX512BW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 -; AVX512BW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: addq $1032, %rsp # imm = 0x408 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-SLOW-LABEL: load_i16_stride8_vf64: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; 
AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm17, 
%zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, 
%zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: load_i16_stride8_vf64: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: 
vpermt2w %zmm15, %zmm10, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 
%zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512BW-ONLY-FAST-NEXT: # ymm18 = 
mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 
64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: load_i16_stride8_vf64: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: movb $-64, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 
%zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm5, %zmm13, 
%zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-SLOW-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rdx) +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: load_i16_stride8_vf64: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: movb $-64, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm7, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; 
AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm9, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm2, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm7, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; 
AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm19, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm10, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm15, %zmm17, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm17, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm12, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm14, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm5, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm22, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm2, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm10, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm12, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 
%zmm30, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm14, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm0, %zmm17, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm11, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57] +; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59] +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm29, %zmm10, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm9, 
%zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61] +; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm5, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63] +; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm11, %zmm18, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2w %zmm31, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2w %zmm31, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: 
vmovaps %zmm7, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQBW-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <512 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> %strided.vec1 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> @@ -9032,13 +9915,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW: {{.*}} ; AVX512: {{.*}} ; AVX512BW-FAST: {{.*}} -; AVX512BW-ONLY-FAST: {{.*}} -; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512BW-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} -; AVX512DQBW-FAST: {{.*}} -; AVX512DQBW-SLOW: {{.*}} ; AVX512F: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index a6e64e1d8f6d0..7d3f515e259e9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -78,7 +78,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <5,0,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0] ; AVX512F-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 ; AVX512F-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -106,7 +106,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <5,0,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0] ; AVX512BW-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 ; AVX512BW-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 8b1aae61ed5c3..25254def1809e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -78,7 +78,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] ; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovq %xmm2, (%rsi) @@ -106,7 +106,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] ; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index d28215f89fdc1..bcffe374f4b87 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -145,7 +145,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] ; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5] ; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 ; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] ; AVX512F-FAST-NEXT: # xmm1 = mem[0,0] @@ -207,7 +207,7 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] ; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <3,5,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5] ; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 ; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] ; AVX512BW-FAST-NEXT: # xmm1 = mem[0,0] @@ -367,13 +367,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,2,u,u> +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [4,2,4,2] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <5,3,u,u> +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,3,5,3] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) @@ -901,12 +901,14 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = 
[0,1,2,3,4,5,13,11] ; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] +; AVX512-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <2,8,14,20,26,u,u,u> ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] +; AVX512-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <3,9,15,21,27,u,u,u> ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index f9713d1eab16c..a8b737fc81732 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -133,7 +133,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,11,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11] ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512F-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 @@ -164,9 +164,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <7,2,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2] ; AVX512F-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,11,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11] ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 @@ -201,7 +201,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <4,11,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11] ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512BW-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 @@ -232,9 +232,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <7,2,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2] ; AVX512BW-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,11,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11] ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512BW-FAST-NEXT: vpermi2d %ymm1, %ymm6, 
%ymm0 @@ -911,7 +911,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <4,3,u,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -986,7 +986,8 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,6,5,6,5,6,5,6] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,0,7,u,u,u,u,u> +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,0,7,0,1,0,7,0] +; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] @@ -1014,7 +1015,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <4,3,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1025,7 +1026,8 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,0,7,0,1,0,7,0] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] @@ -1117,7 +1119,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1169,37 +1171,44 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] +; AVX512-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = 
<0,7,14,21,28,u,u,u> ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] +; AVX512-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <1,8,15,22,29,u,u,u> ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] +; AVX512-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] +; AVX512-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <19,26,1,8,15,u,u,u> ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] +; AVX512-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,11,18,25] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,8,15,22,1,8,15,22] +; AVX512-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,12,19,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,9,16,23,2,9,16,23] +; AVX512-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,13,20,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 @@ -1923,7 +1932,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,3,u,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10 @@ -2149,7 +2158,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,3,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 @@ -2376,7 +2385,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] 
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <4,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10 @@ -4815,7 +4824,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,3,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 5f6de8f4b8bf4..b161f24e01f2f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -153,7 +153,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] ; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 @@ -220,7 +220,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] ; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512BW-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 @@ -817,42 +817,50 @@ define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,0,8,16,24] +; AVX512-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,16,24] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,1,9,17,25] +; AVX512-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [1,9,17,25] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,2,10,18,26] +; AVX512-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,10,18,26] ; AVX512-NEXT: vpermi2d 
%zmm1, %zmm0, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,3,11,19,27] +; AVX512-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [3,11,19,27] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,4,12,20,28] +; AVX512-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,12,20,28] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,5,13,21,29] +; AVX512-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [5,13,21,29] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,6,14,22,30] +; AVX512-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [6,14,22,30] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,7,15,23,31] +; AVX512-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [7,15,23,31] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 @@ -1694,7 +1702,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: movb $-64, %dil ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] +; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] @@ -1706,7 +1715,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,9,17,25,1,9,17,25] +; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm10 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] @@ -1718,7 +1728,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm11 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -1730,7 +1741,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27] +; AVX512F-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm12 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] @@ -1742,7 +1754,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28] +; AVX512F-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm13 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] @@ -1754,7 +1767,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29] +; AVX512F-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm14 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] @@ -1766,7 +1780,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30] +; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm15 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] @@ -1776,7 +1791,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm5, %zmm15, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm15, %zmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -1814,7 +1830,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: movb $-64, %dil ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] @@ -1826,7 +1843,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = 
[1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm10 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] @@ -1838,7 +1856,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm11 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -1850,7 +1869,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm12 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] @@ -1862,7 +1882,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm13 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7] @@ -1874,7 +1895,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm14 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm13 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] @@ -1886,7 +1908,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm15 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] @@ -1896,7 +1919,8 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm15, %zmm0 ; 
AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -3727,559 +3751,2293 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: load_i32_stride8_vf32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 -; AVX512F-NEXT: movb $-64, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512F-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: 
vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512F-NEXT: 
vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512F-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512F-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512F-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; 
AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = -; AVX512F-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512F-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512F-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512F-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm7, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm7, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512F-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512F-NEXT: 
movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512F-NEXT: addq $1032, %rsp # imm = 0x408 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf32: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 +; 
AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} 
ymm18 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm9, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: load_i32_stride8_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: 
vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 -; 
AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm25 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 -; AVX512BW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512BW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = -; AVX512BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 -; AVX512BW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 -; AVX512BW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} -; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm7, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm7, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm7, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm29, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512BW-NEXT: addq $1032, %rsp # imm = 0x408 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf32: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: movb $-64, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512F-ONLY-FAST-NEXT: 
vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; 
AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf32: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: movb $-64, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; 
AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = 
[7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm7, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; 
AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: load_i32_stride8_vf32: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: movb $-64, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; 
AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 
%zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, 
%zmm29, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, 
%ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm7, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf32: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 +; 
AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm7, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf32: +; AVX512BW-ONLY-FAST: # %bb.0: +; 
AVX512BW-ONLY-FAST-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 
%zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, 
%zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm7, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf32: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: movb $-64, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm27 +; 
AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm7, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; 
AVX512DQBW-FAST-LABEL: load_i32_stride8_vf32: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: movb $-64, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm2, %zmm6, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 
%zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm17 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm10, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm17, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: 
vpermt2d %zmm31, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm4, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm17, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm1, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm12, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm14, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm5, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm10, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm12, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm14, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm17, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm11, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm29, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,10,18,26,2,10,18,26] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm19, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm10, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm22 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm18 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # ymm18 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm11, %zmm18, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm31, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm24, %zmm9 +; 
AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = mem[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm25, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm28, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm30, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm7, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQBW-FAST-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <256 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> %strided.vec1 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> @@ -8145,1193 +9903,4829 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; 
AVX512F-LABEL: load_i32_stride8_vf64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512F-NEXT: movb $-64, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; 
AVX512F-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512F-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm26, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; 
AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512F-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512F-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 
# 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512F-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512F-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512F-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512F-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 -; AVX512F-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512F-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512F-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 
64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512F-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512F-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = -; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512F-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512F-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512F-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512F-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, (%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, (%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 64(%rax) -; 
AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512F-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: load_i32_stride8_vf64: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm21, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[5,13,21,29,5,13,21,29] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = 
mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 
192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: load_i32_stride8_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $3304, %rsp # imm = 0xCE8 -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm27 -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm25 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm26 -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, 
%zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, 
%zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm31 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte 
Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm27 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 -; AVX512BW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} -; AVX512BW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] -; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 -; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm26 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 -; AVX512BW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm28 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 -; AVX512BW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm9, 
%zmm5, %zmm8 -; AVX512BW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 -; AVX512BW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 -; AVX512BW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm20 -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = -; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7 -; AVX512BW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 -; 
AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512BW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 -; AVX512BW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} -; AVX512BW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} -; AVX512BW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, (%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 192(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, (%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 192(%rcx) -; 
AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm5, 128(%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512BW-NEXT: addq $3304, %rsp # imm = 0xCE8 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: load_i32_stride8_vf64: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: movb $-64, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, 
%zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm17, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: 
vpermt2d %zmm0, %zmm25, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 
64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 
$0, %ymm1, %zmm10, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte 
Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: load_i32_stride8_vf64: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte 
Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: movb $-64, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: 
vpermt2d %zmm27, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, 
%ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, (%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $3304, %rsp # imm = 
0xCE8 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: load_i32_stride8_vf64: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: movb $-64, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: 
vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; 
AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload 
+; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2d 
%zmm12, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 
$0, %ymm7, %zmm11, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 
64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, (%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: load_i32_stride8_vf64: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 
64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d 
%zmm27, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: 
# ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: load_i32_stride8_vf64: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm3, 
%zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; 
AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 
= [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: load_i32_stride8_vf64: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: movb $-64, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: 
vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: 
vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = 
[7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd $15, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: 
vmovaps %zmm5, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQBW-SLOW-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: load_i32_stride8_vf64: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $3304, %rsp # imm = 0xCE8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: movb $-64, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,8,16,24,0,8,16,24] +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm5, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm8, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 +; 
AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm4, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9,17,25,1,9,17,25] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm16, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm9, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,10,18,26,2,10,18,26] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm3, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm2 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm7, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 
64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,11,19,27,3,11,19,27] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm15, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm21, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm27 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm31, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm13, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm4, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,20,28,4,12,20,28] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm23, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm14, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm0, %zmm2 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm0, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm21, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm6, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm18, %zmm3, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm16, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm19, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm30, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm24, %zmm25, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 
%zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm3, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm5, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm28, %zmm25, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm3, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm3, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm5, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm9, %zmm25, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm3, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm5, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm29, %zmm25, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm3, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm5, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm12, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm12, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm27, %zmm25, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm20, %zmm25, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm25, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm25 # 
32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,21,29,5,13,21,29] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm28, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm29, %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,22,30,6,14,22,30] +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm3, %zmm15 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,23,31,7,15,23,31] +; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm2, %zmm16, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm26, %zmm16, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2d %zmm22, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm22, %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2d %zmm1, %zmm16, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm10 = mem[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm11, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm9[4,5,6,7] +; 
AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm30, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, (%rsp), %ymm14, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm21, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm20, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 192(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, (%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; 
AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%r9) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm5, 128(%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQBW-FAST-NEXT: addq $3304, %rsp # imm = 0xCE8 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <512 x i32>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> %strided.vec1 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> @@ -9358,14 +14752,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} -; AVX512BW-ONLY-FAST: {{.*}} -; AVX512BW-ONLY-SLOW: {{.*}} -; AVX512DQ-FAST: {{.*}} -; AVX512DQ-SLOW: {{.*}} -; AVX512DQBW-FAST: {{.*}} -; AVX512DQBW-SLOW: {{.*}} -; AVX512F-ONLY-FAST: {{.*}} -; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 3eca48fbddbce..f14a54a8e93ca 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -279,7 +279,8 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <11,1,7,u> ; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] +; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] ; AVX512F-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 @@ -362,7 +363,8 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <11,1,7,u> ; AVX512BW-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6] +; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpermi2q %ymm7, %ymm4, %ymm8 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10] ; AVX512BW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 49a2bfaeb0539..d47558166f061 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -332,7 +332,8 @@ define void 
@load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512F-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512F-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 @@ -362,7 +363,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512F-FAST-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512F-FAST-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -384,7 +386,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512F-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] +; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512F-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 @@ -437,7 +440,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512BW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] +; AVX512BW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-SLOW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512BW-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 @@ -467,7 +471,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1 ; AVX512BW-FAST-NEXT: vpbroadcastq 176(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,7,0,7] +; AVX512BW-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512BW-FAST-NEXT: vpermi2q 160(%rdi), %ymm6, %ymm2 ; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -489,7 +494,8 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [5,12] ; AVX512BW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm9 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,11,4,11] +; AVX512BW-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9 ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13] ; AVX512BW-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm3 @@ -900,7 +906,8 @@ define void @load_i64_stride7_vf8(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm15 = [4,11] ; AVX512F-NEXT: vpermi2q %zmm2, %zmm5, %zmm15 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,11,4,11] +; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm6, %zmm7, %zmm15 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> @@ -1009,7 +1016,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,11] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm5, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,11,4,11] +; AVX512BW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm7, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <9,0,7,u> @@ -1976,7 +1984,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512F-NEXT: vpermt2q %zmm27, %zmm11, %zmm10 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm1 @@ -2161,7 +2170,8 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,5,8,15,4,5,8,15] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,11,4,11] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm11, %zmm10 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,13] ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm1 @@ -3776,880 +3786,3513 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: load_i64_stride7_vf32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2152, %rsp # imm = 0x868 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm16 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm19 -; AVX512F-NEXT: vmovaps 1024(%rdi), %zmm1 -; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512F-NEXT: vmovaps 576(%rdi), %zmm3 -; AVX512F-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm18 -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm3 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; 
AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] -; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm30 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm29 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vpermt2q %zmm13, %zmm4, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 -; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 -; AVX512F-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 -; AVX512F-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 -; AVX512F-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vpermt2q %zmm6, %zmm3, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: movb $24, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = -; AVX512F-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm5 
-; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 -; AVX512F-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1] -; AVX512F-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512F-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 -; AVX512F-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512F-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %ymm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 -; AVX512F-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512F-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512F-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 -; AVX512F-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa 1088(%rdi), %ymm0 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] -; AVX512F-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 -; AVX512F-NEXT: vmovdqa 960(%rdi), %ymm11 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX512F-NEXT: vmovdqa 512(%rdi), %ymm13 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 -; AVX512F-NEXT: movb $-32, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, 
%zmm28 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} -; AVX512F-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} -; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} -; AVX512F-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512F-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm11 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512F-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm11 -; AVX512F-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqa 1536(%rdi), %ymm11 -; AVX512F-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, 
%ymm11, %zmm14, %zmm11 -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm7, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512F-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512F-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512F-NEXT: vmovdqa64 %zmm3, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm2, 128(%r9) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm12, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-NEXT: vmovaps %zmm9, 64(%rax) -; AVX512F-NEXT: addq $2152, %rsp # imm = 0x868 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: load_i64_stride7_vf32: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] +; AVX512F-ONLY-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte 
Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: movb $24, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 448(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] +; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] +; AVX512F-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: movb $-32, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm9, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: load_i64_stride7_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2120, %rsp # imm = 0x848 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512BW-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512BW-NEXT: vmovaps 576(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm28 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] -; AVX512BW-NEXT: # zmm3 = 
mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 -; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: movb $24, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1] -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 -; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 -; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 -; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm6 -; AVX512BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqu64 
%zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX512BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm1 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 -; AVX512BW-NEXT: movb $-32, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} -; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm2 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa 1408(%rdi), %ymm13 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13 -; AVX512BW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm16 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm8 -; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = 
ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 -; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 -; AVX512BW-NEXT: vmovdqa64 %zmm15, 192(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 128(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm27, (%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%rcx) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm2, 128(%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-NEXT: vmovaps %zmm8, 64(%rax) -; AVX512BW-NEXT: addq $2120, %rsp # imm = 0x848 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: load_i64_stride7_vf32: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovaps 1024(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] +; AVX512F-ONLY-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 +; 
AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, 
%zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, 
%zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: movb $24, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] +; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 
%zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; 
AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512F-ONLY-FAST-NEXT: movb $-32, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm9, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: load_i64_stride7_vf32: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovaps 1024(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovaps 576(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] +; AVX512DQ-SLOW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] +; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: movb $24, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] +; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] +; AVX512DQ-SLOW-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 
%zmm19, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa 960(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512DQ-SLOW-NEXT: movb $-32, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; AVX512DQ-SLOW-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, (%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm9, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: load_i64_stride7_vf32: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $2152, %rsp # imm = 0x868 +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 
1088(%rdi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovaps 1024(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovaps 576(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [10,3,10,3,10,3,10,3] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [13,6,13,6,13,6,13,6] +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5] +; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm22, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm22, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm30 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm26, %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm20, %zmm15 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512DQ-FAST-NEXT: # 
zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm22, %zmm21 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm4, %zmm31 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm11 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm20, %zmm25 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm17, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512DQ-FAST-NEXT: movb $24, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm17 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0] +; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm22 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm15, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 {%k1} = zmm1[0,1,4,5],zmm29[4,5,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm8[0,1,4,5],zmm13[4,5,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm13, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm18, %zmm19 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm13, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm6[0,1,4,5],zmm9[4,5,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 +; 
AVX512DQ-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm27, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm8[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 1088(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [6,13] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm7 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm8, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa 960(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm8, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm8, %zmm12 +; AVX512DQ-FAST-NEXT: movb $-32, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm29 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm2 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm23 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm30 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm27 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm24 {%k2} +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm11, %zmm19, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm3 {%k2} +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm22, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm4 {%k2} +; 
AVX512DQ-FAST-NEXT: vpblendd $240, (%rsp), %ymm10, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = ymm10[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 {%k2} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm25, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm18, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 {%k2} +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = ymm9[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm9, %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = ymm14[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 
$0, %ymm5, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, (%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm9, 64(%rax) +; AVX512DQ-FAST-NEXT: addq $2152, %rsp # imm = 0x868 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride7_vf32: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] +; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; 
AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q 
%zmm4, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: movb $24, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] +; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: movb $-32, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2 +; 
AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # 
ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: load_i64_stride7_vf32: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3 
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovaps 576(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] +; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; 
AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] +; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: movb $24, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [4,11,4,11] +; AVX512BW-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: movb $-32, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = 
mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: 
vmovaps %zmm2, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: load_i64_stride7_vf32: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovaps 576(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] +; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm9, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
384(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: movb $24, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] +; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 912(%rdi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: 
vinserti64x4 $0, %ymm1, %zmm23, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512DQBW-SLOW-NEXT: movb $-32, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %ymm16 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 128(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQBW-SLOW-NEXT: 
movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 64(%rax) +; AVX512DQBW-SLOW-NEXT: addq $2120, %rsp # imm = 0x848 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: load_i64_stride7_vf32: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $2120, %rsp # imm = 0x848 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovaps 576(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,4,11,4,11,4,11,4] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [12,5,12,5,12,5,12,5] +; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm19, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm19, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm13 +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm23 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [9,0,7,0,9,0,7,0] +; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm29, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm29, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm31, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm15, %zmm4, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm29, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm15, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,9,0,5,6,9] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,10,0,5,6,10] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,11,0,5,6,11] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,5,6,12,0,5,6,12] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,13,4,5,6,13] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm26 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 
%zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,8,15,4,5,8,15] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm3, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: movb $24, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,7,14,u> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm15, %zmm17 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm14[0,1,4,5],zmm12[4,5,0,1] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm28 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0] +; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm16, %zmm19 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [4,11,4,11] +; AVX512DQBW-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm24, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[0,1,4,5],zmm7[4,5,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm16, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm15 {%k1} = zmm6[0,1,4,5],zmm13[4,5,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm13, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm13, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm24, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm10[0,1,4,5],zmm30[4,5,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm24, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 912(%rdi), %xmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm1, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa 464(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm28, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm9, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa 1360(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm0[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,11] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 576(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 1472(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQBW-FAST-NEXT: vpermi2q 
%zmm2, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = <9,0,7,u> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm26, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [5,12] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,13] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm26, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm7, %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa 1088(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm20, %zmm18 +; AVX512DQBW-FAST-NEXT: movb $-32, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm30 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 960(%rdi), %ymm2 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm2, %zmm22, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm4, %zmm19, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm10, %zmm21, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa 1408(%rdi), %ymm13 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $0, %xmm13, %zmm16, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %ymm16 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} ymm8 = 
ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = ymm12[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd $240, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = ymm11[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 192(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 128(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 192(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 192(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 192(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, (%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 192(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, (%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm2, 128(%r9) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 64(%rax) +; AVX512DQBW-FAST-NEXT: addq $2120, %rsp # 
imm = 0x848 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <224 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> %strided.vec1 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> @@ -8696,7 +11339,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,4,11] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpermt2q %zmm6, %zmm3, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm29 # 64-byte Reload @@ -9677,7 +12321,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,4,11] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload @@ -10073,14 +12718,6 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST: {{.*}} ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} -; AVX512BW-ONLY-FAST: {{.*}} -; AVX512BW-ONLY-SLOW: {{.*}} -; AVX512DQ-FAST: {{.*}} -; AVX512DQ-SLOW: {{.*}} -; AVX512DQBW-FAST: {{.*}} -; AVX512DQBW-SLOW: {{.*}} -; AVX512F-ONLY-FAST: {{.*}} -; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} ; FALLBACK10: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll index e7067d4f6bd76..439265452cc47 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -371,22 +371,26 @@ define void @load_i64_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] ; AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,4,12] +; AVX512-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm2 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [4,12] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] +; AVX512-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm11 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [5,13] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [6,14,6,14] +; AVX512-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm12 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [6,14] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm13 ; 
AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [7,15,7,15] +; AVX512-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2q %zmm5, %zmm6, %zmm13 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [7,15] ; AVX512-NEXT: vpermi2q %zmm3, %zmm4, %zmm5 @@ -914,7 +918,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpermt2q %zmm6, %zmm14, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} ; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 @@ -922,7 +927,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm16 ; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} @@ -945,12 +951,14 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} ; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm17, %zmm6 ; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm19, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm20, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 @@ -1028,7 +1036,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm14, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm7, %zmm7 @@ -1036,7 +1045,8 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm16 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,13] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 {%k1} @@ -1059,12 +1069,14 @@ define void @load_i64_stride8_vf8(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm17 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm17, %zmm6 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm19, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm20, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 @@ -2123,7 +2135,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 ; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm14 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] @@ -2141,7 +2154,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512F-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] @@ -2186,10 +2200,12 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm27, %zmm1, %zmm3 ; AVX512F-NEXT: vpermi2q %zmm30, %zmm6, %zmm5 ; AVX512F-NEXT: vpermt2q %zmm30, %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 ; AVX512F-NEXT: vpermt2q %zmm25, %zmm1, %zmm14 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15] +; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm13 ; AVX512F-NEXT: vpermi2q %zmm16, %zmm0, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm16, %zmm15, %zmm0 @@ -2349,7 +2365,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7] @@ -2367,7 +2384,8 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512BW-NEXT: 
vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7] @@ -2412,10 +2430,12 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm1, %zmm3 ; AVX512BW-NEXT: vpermi2q %zmm30, %zmm6, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [7,15,7,15] +; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm13 ; AVX512BW-NEXT: vpermi2q %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm15, %zmm0 @@ -4619,7 +4639,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm4 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -4717,7 +4738,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512F-NEXT: vpermt2q %zmm9, %zmm7, %zmm15 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] @@ -4827,12 +4849,14 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15] +; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 @@ -5151,7 +5175,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,12,4,12] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqa64 
%zmm10, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5249,7 +5274,8 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,13,5,13] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm7, %zmm15 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm15[4,5,6,7] @@ -5359,12 +5385,14 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [6,14,6,14] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [7,15,7,15] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 @@ -9640,2267 +9668,9085 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: load_i64_stride8_vf64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512F-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1984(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512F-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: movb $-64, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = 
[2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512F-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512F-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512F-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 3072(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 640(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512F-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 1792(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3008(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2944(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 2880(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 2752(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 2688(%rdi), %ymm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %ymm16 -; AVX512F-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512F-NEXT: vmovdqa64 2432(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2304(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 
2240(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 2176(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 4032(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa64 3904(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512F-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512F-NEXT: vmovdqa 3648(%rdi), %ymm12 -; AVX512F-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512F-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-NEXT: 
vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: 
vpermt2q %zmm29, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 3136(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] -; AVX512F-NEXT: 
vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa64 2624(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa64 3648(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3584(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqa64 3776(%rdi), %zmm3 -; 
AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] -; 
AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: 
vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512F-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512F-NEXT: 
vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: 
vpermt2q %zmm8, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; 
AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512F-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512F-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512F-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512F-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512F-NEXT: vmovdqa 1216(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512F-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512F-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512F-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512F-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] -; AVX512F-NEXT: vmovdqa 1600(%rdi), %xmm12 -; AVX512F-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512F-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512F-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512F-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512F-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512F-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512F-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512F-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512F-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 3264(%rdi), %xmm25 -; AVX512F-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512F-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512F-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512F-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512F-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512F-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512F-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512F-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512F-NEXT: # xmm11 = xmm11[1],mem[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 
-; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm22, 384(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rsi) -; AVX512F-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512F-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rcx) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps 
%zmm0, 256(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%r8) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%r9) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 
384(%rax) -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: addq $6728, %rsp # imm = 0x1A48 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: load_i64_stride8_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $6728, %rsp # imm = 0x1A48 -; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm24 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %ymm21 -; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] -; AVX512BW-NEXT: vmovdqa 3136(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 3072(%rdi), %ymm3 -; AVX512BW-NEXT: 
vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %ymm25 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm31 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 1728(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 1664(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %ymm18 -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %ymm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm28 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %ymm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm30 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %ymm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 2688(%rdi), %ymm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %ymm16 -; AVX512BW-NEXT: vmovdqa 2560(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 2240(%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 2176(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 2112(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa 2048(%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 4032(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3968(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 3904(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 3840(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %ymm22 -; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %ymm12 -; AVX512BW-NEXT: vmovdqa 3584(%rdi), %ymm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] -; AVX512BW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm14 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte 
Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 -; 
AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 3648(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3584(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqa64 3776(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 
64-byte Folded Reload -; AVX512BW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload 
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm27 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: 
vpermt2q %zmm5, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm2 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm3 -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload -; AVX512BW-NEXT: 
vpermt2q %zmm28, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm12, 
%zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm7 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; 
AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} -; AVX512BW-NEXT: vmovdqa 704(%rdi), %xmm8 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %xmm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %xmm18 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %xmm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} -; AVX512BW-NEXT: vmovdqa 1216(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-NEXT: vmovdqa 1152(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %xmm23 -; AVX512BW-NEXT: vmovdqa 1024(%rdi), %xmm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} -; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %xmm30 -; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %xmm31 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] -; AVX512BW-NEXT: vmovdqa 1600(%rdi), %xmm12 -; AVX512BW-NEXT: vmovdqa 1536(%rdi), %xmm11 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} -; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %xmm19 -; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %xmm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] -; AVX512BW-NEXT: vmovdqa 2112(%rdi), %xmm15 -; AVX512BW-NEXT: vmovdqa 2048(%rdi), %xmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %xmm28 -; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %xmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] -; AVX512BW-NEXT: vmovdqa 2624(%rdi), %xmm10 -; AVX512BW-NEXT: vmovdqa 2560(%rdi), %xmm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, 
%zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %xmm25 -; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %xmm27 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] -; AVX512BW-NEXT: vmovdqa 3136(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 3072(%rdi), %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa 3776(%rdi), %xmm7 -; AVX512BW-NEXT: vmovdqa64 3712(%rdi), %xmm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] -; AVX512BW-NEXT: vmovdqa 3648(%rdi), %xmm6 -; AVX512BW-NEXT: vmovdqa 3584(%rdi), %xmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512BW-NEXT: # xmm4 = xmm4[1],mem[1] -; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload -; AVX512BW-NEXT: # xmm8 = xmm8[1],mem[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512BW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512BW-NEXT: # xmm11 = 
xmm11[1],mem[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm24, 448(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rsi) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 448(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 256(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 192(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 384(%rdx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rcx) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rcx) -; AVX512BW-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%r8) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: addq $6728, %rsp # imm = 0x1A48 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: load_i64_stride8_vf64: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3008(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2880(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; 
AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, 
%zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; 
AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2560(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, 
%zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = 
zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm16, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; 
AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 
$1, %xmm10, %ymm12, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; 
AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq +; +; AVX512F-ONLY-FAST-LABEL: load_i64_stride8_vf64: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: movb $-64, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; 
AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3008(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2880(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), 
%ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; 
AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte 
Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q 
%zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), 
%zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2560(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} 
+; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm15, %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, 
%zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
(%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = 
xmm27[1],xmm25[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] +; 
AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; 
AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: load_i64_stride8_vf64: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: movb $-64, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; 
AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21 +; AVX512DQ-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3008(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2880(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; 
AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512DQ-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 
192(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2560(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: 
vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3776(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; 
AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = 
zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 
+; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = 
xmm2[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
+; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: load_i64_stride8_vf64: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: movb $-64, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] +; AVX512DQ-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 
32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3008(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 2880(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 
2176(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] +; AVX512DQ-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512DQ-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3072(%rdi), 
%zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2560(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; 
AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 3776(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 
64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: 
vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 
64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 
64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; 
AVX512DQ-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23 +; AVX512DQ-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512DQ-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} +; 
AVX512DQ-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] +; AVX512DQ-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15 +; AVX512DQ-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] +; AVX512DQ-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] +; AVX512DQ-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] +; AVX512DQ-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 
# 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: load_i64_stride8_vf64: 
+; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, 
%zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3008(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2880(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2560(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3776(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = 
zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: 
vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, 
%zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm24 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; 
AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: load_i64_stride8_vf64: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3008(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2880(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; 
AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded 
Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2560(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, 
%zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3776(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload 
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; 
AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 
+; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = 
zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 
# 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 
(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: load_i64_stride8_vf64: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3328(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 768(%rdi), 
%zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: movb $-64, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %ymm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3200(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 3136(%rdi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; 
AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm31 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %ymm26 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %ymm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %ymm27 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3008(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, 
%zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2880(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 2688(%rdi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa 2560(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 2112(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 2048(%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3968(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3840(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3776(%rdi), %ymm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
3712(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 
= ymm15[2,3],ymm13[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, 
%zmm10 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3136(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3072(%rdi), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload 
+; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2560(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2688(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2112(%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3584(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3776(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: 
vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte 
Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 640(%rdi), %xmm20 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 576(%rdi), %xmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 512(%rdi), %xmm16 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1088(%rdi), %xmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1728(%rdi), %xmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 1664(%rdi), %xmm31 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2240(%rdi), %xmm19 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 2176(%rdi), %xmm21 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 2112(%rdi), %xmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2752(%rdi), %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 2688(%rdi), %xmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa 2560(%rdi), %xmm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3200(%rdi), %xmm27 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 3136(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3072(%rdi), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 3776(%rdi), %xmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 3712(%rdi), %xmm26 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 3648(%rdi), %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 3584(%rdi), %xmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; 
AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 448(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 
64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: load_i64_stride8_vf64: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $6728, %rsp # imm = 0x1A48 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3392(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3328(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3520(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3456(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1856(%rdi), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1984(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 832(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 768(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 960(%rdi), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 896(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: movb $-64, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %ymm21 +; AVX512DQBW-FAST-NEXT: vmovdqa 3200(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm21[0],ymm0[2],ymm21[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 3136(%rdi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 3072(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 704(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 640(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %ymm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %ymm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm23[0],ymm25[0],ymm23[2],ymm25[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm31 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm31[0],ymm0[0],ymm31[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm20[0],ymm19[2],ymm20[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1792(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 1664(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %ymm26 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm26[0],ymm18[0],ymm26[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1472(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1408(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1344(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1280(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %ymm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %ymm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm29[0],ymm28[0],ymm29[2],ymm28[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %ymm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %ymm27 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm27[0],ymm30[0],ymm27[2],ymm30[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3008(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2944(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2880(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2816(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 2752(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 2688(%rdi), %ymm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 2624(%rdi), %ymm16 +; AVX512DQBW-FAST-NEXT: vmovdqa 2560(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm9[0],ymm16[0],ymm9[2],ymm16[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2496(%rdi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2432(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2368(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2304(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 2240(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 2176(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = 
ymm3[0],ymm0[0],ymm3[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 2112(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 2048(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 4032(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3968(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3904(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3840(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 3776(%rdi), %ymm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %ymm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm17[0],ymm22[0],ymm17[2],ymm22[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 3648(%rdi), %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 3584(%rdi), %ymm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm0[0],ymm12[0],ymm0[2],ymm12[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm14[2,3],ymm15[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm13[1],ymm21[1],ymm13[3],ymm21[3] +; AVX512DQBW-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm13[2,3],ymm14[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded 
Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm13[1],mem[1],ymm13[3],mem[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm23[1],ymm25[1],ymm23[3],ymm25[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm31, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm31[1],mem[1],ymm31[3],mem[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm20[1],ymm19[3],ymm20[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm26[1],ymm18[1],ymm26[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm13 = ymm29[1],ymm28[1],ymm29[3],ymm28[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm27[1],ymm30[1],ymm27[3],ymm30[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm15[2,3],ymm13[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm14, %zmm13 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm16[1],ymm9[3],ymm16[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm9[2,3],ymm11[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm14, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm7 = ymm5[1],mem[1],ymm5[3],mem[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm5 = ymm8[1],mem[1],ymm8[3],mem[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm7[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm10, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm22[1],ymm17[3],ymm22[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm22[0],zmm9[0],zmm22[2],zmm9[2],zmm22[4],zmm9[4],zmm22[6],zmm9[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 3136(%rdi), 
%zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3072(%rdi), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3200(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 704(%rdi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm12[0],zmm23[0],zmm12[2],zmm23[2],zmm12[4],zmm23[4],zmm12[6],zmm23[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, 
%zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm19[0],zmm1[2],zmm19[2],zmm1[4],zmm19[4],zmm1[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1600(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1536(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm1[0],zmm10[0],zmm1[2],zmm10[2],zmm1[4],zmm10[4],zmm1[6],zmm10[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1024(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1216(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 1152(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm30[0],zmm29[0],zmm30[2],zmm29[2],zmm30[4],zmm29[4],zmm30[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 2624(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2560(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2752(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2688(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2112(%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2048(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2240(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 2176(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm16[0],zmm30[0],zmm16[2],zmm30[2],zmm16[4],zmm30[4],zmm16[6],zmm30[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3648(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3584(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3776(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm1 = zmm8[0],mem[0],zmm8[2],mem[2],zmm8[4],mem[4],zmm8[6],mem[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm13 +; 
AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm22[1],zmm9[1],zmm22[3],zmm9[3],zmm22[5],zmm9[5],zmm22[7],zmm9[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm17[1],zmm19[1],zmm17[3],zmm19[3],zmm17[5],zmm19[5],zmm17[7],zmm19[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm4 = zmm12[1],mem[1],zmm12[3],mem[3],zmm12[5],mem[5],zmm12[7],mem[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: 
vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm4[1],zmm10[1],zmm4[3],zmm10[3],zmm4[5],zmm10[5],zmm4[7],zmm10[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm4 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 
{%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm1 = zmm4[1],mem[1],zmm4[3],mem[3],zmm4[5],mem[5],zmm4[7],mem[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm3 {%k1} # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm3 {%k1} = zmm22[0],mem[0],zmm22[2],mem[2],zmm22[4],mem[4],zmm22[6],mem[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm23[0],zmm18[0],zmm23[2],zmm18[2],zmm23[4],zmm18[4],zmm23[6],zmm18[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm20[0],zmm14[0],zmm20[2],zmm14[2],zmm20[4],zmm14[4],zmm20[6],zmm14[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm26[0],zmm25[0],zmm26[2],zmm25[2],zmm26[4],zmm25[4],zmm26[6],zmm25[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = zmm24[0],zmm29[0],zmm24[2],zmm29[2],zmm24[4],zmm29[4],zmm24[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 {%k1} = 
zmm22[0],zmm19[0],zmm22[2],zmm19[2],zmm22[4],zmm19[4],zmm22[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm21[0],zmm17[0],zmm21[2],zmm17[2],zmm21[4],zmm17[4],zmm21[6],zmm17[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm5, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm31, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm30, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm1[0],zmm23[0],zmm1[2],zmm23[2],zmm1[4],zmm23[4],zmm1[6],zmm23[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm4[1],zmm1[1],zmm4[3],zmm1[3],zmm4[5],zmm1[5],zmm4[7],zmm1[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm3, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm2, %zmm13, %zmm5 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm20[1],zmm14[1],zmm20[3],zmm14[3],zmm20[5],zmm14[5],zmm20[7],zmm14[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm3, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm3, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm16, %zmm12 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm12 {%k1} = zmm26[1],zmm25[1],zmm26[3],zmm25[3],zmm26[5],zmm25[5],zmm26[7],zmm25[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm3, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm3, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm24[1],zmm29[1],zmm24[3],zmm29[3],zmm24[5],zmm29[5],zmm24[7],zmm29[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm3, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm16, %zmm9 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 {%k1} = zmm22[1],zmm19[1],zmm22[3],zmm19[3],zmm22[5],zmm19[5],zmm22[7],zmm19[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm3, %zmm27 
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm16, %zmm31 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm31 {%k1} = zmm21[1],zmm17[1],zmm21[3],zmm17[3],zmm21[5],zmm17[5],zmm21[7],zmm17[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm3, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm26, %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm16, %zmm15 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm24[1],zmm23[1],zmm24[3],zmm23[3],zmm24[5],zmm23[5],zmm24[7],zmm23[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm16, %zmm15 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm10[1],zmm8[1],zmm10[3],zmm8[3],zmm10[5],zmm8[5],zmm10[7],zmm8[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm7 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm8 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm15 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm9 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm1 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm2 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm4 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm11 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm12, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = 
ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm16 = xmm0[0],xmm1[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm16, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 704(%rdi), %xmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 640(%rdi), %xmm20 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm20[0],xmm8[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 576(%rdi), %xmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 512(%rdi), %xmm16 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm16[0],xmm18[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm31, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 1216(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa 1152(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm1[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 1088(%rdi), %xmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa 1024(%rdi), %xmm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm14[0],xmm23[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm31, %ymm30, %ymm30 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm30, %zmm25, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 1728(%rdi), %xmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 1664(%rdi), %xmm31 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm31[0],xmm30[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 1600(%rdi), %xmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 1536(%rdi), %xmm11 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm11[0],xmm12[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm29 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 2240(%rdi), %xmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2176(%rdi), %xmm21 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm21[0],xmm19[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 2112(%rdi), %xmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa 2048(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm1[0],xmm15[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm25, %ymm28, %ymm25 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 2752(%rdi), %xmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 2688(%rdi), %xmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm29[0],xmm28[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 2624(%rdi), %xmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa 2560(%rdi), %xmm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm9[0],xmm10[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm25, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 3264(%rdi), %xmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3200(%rdi), %xmm27 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm27[0],xmm25[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 3136(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 3072(%rdi), %xmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm2[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm26, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 3776(%rdi), %xmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 3712(%rdi), %xmm26 +; AVX512DQBW-FAST-NEXT: 
vpunpcklqdq {{.*#+}} xmm4 = xmm26[0],xmm7[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 3648(%rdi), %xmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa 3584(%rdi), %xmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm24 = xmm5[0],xmm6[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm24, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm27[1],xmm25[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm20[1],xmm8[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm16[1],xmm18[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # xmm4 = xmm4[1],mem[1] +; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # xmm8 = xmm8[1],mem[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm8, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm31[1],xmm30[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm12[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm11, %ymm8 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # xmm11 = xmm11[1],mem[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm23[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm29[1],xmm28[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm9 = xmm9[1],xmm10[1] 
+; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm9, %ymm9 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm21[1],xmm19[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm1[1],xmm15[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm12, %ymm10 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm26[1],xmm7[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 448(%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 384(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 448(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 256(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 320(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 192(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 64(%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 384(%rdx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rcx) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%r8) +; AVX512DQBW-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%r8) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%r9) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%r9) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte 
Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-FAST-NEXT: addq $6728, %rsp # imm = 0x1A48 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <512 x i64>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> %strided.vec1 = shufflevector <512 x i64> %wide.vec, <512 x i64> poison, <64 x i32> @@ -11928,16 +18774,8 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} -; AVX512BW-ONLY-FAST: {{.*}} -; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512BW-SLOW: {{.*}} -; AVX512DQ-FAST: {{.*}} -; AVX512DQ-SLOW: {{.*}} -; AVX512DQBW-FAST: {{.*}} -; AVX512DQBW-SLOW: {{.*}} ; AVX512F-FAST: {{.*}} -; AVX512F-ONLY-FAST: {{.*}} -; AVX512F-ONLY-SLOW: {{.*}} ; AVX512F-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 23f26672fe7d0..84f695245dbb1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -174,31 +174,47 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; SSE-NEXT: movdqa %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: load_i8_stride2_vf16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vmovdqa 
%xmm0, (%rsi) -; AVX1-NEXT: vmovdqa %xmm1, (%rdx) -; AVX1-NEXT: retq +; AVX1-ONLY-LABEL: load_i8_stride2_vf16: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rdx) +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i8_stride2_vf16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3 +; AVX2-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rdx) +; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i8_stride2_vf16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3 ; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -450,9 +466,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm2, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3] @@ -460,9 +476,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = 
[1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] @@ -479,13 +495,13 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512F-SLOW-LABEL: load_i8_stride2_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm7 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0 @@ -493,9 +509,9 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] ; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 @@ -510,7 +526,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; ; AVX512F-FAST-LABEL: load_i8_stride2_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 @@ -518,17 +534,17 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, 
%ymm6 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15] ; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index 21b6e38f3f09a..f0118bc3b33b6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -680,7 +680,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -688,7 +689,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25] @@ -706,7 +708,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512F-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -733,7 +736,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -1369,7 +1373,8 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] +; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5 ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm5, %ymm5 @@ -1378,7 +1383,8 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u> +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0] +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8 ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm8, %ymm8 @@ -1393,7 +1399,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5] ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] ; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,0,1] @@ -1424,7 +1430,8 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512F-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 ; AVX512F-NEXT: vinserti128 $1, 
80(%rdi), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpshufb %ymm6, %ymm1, %ymm1 @@ -1456,41 +1463,153 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512BW-LABEL: load_i8_stride3_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512BW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 -; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 -; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 -; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58] -; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] -; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58] -; AVX512BW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] -; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58] -; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57] -; AVX512BW-NEXT: vmovdqa64 %zmm0, 
(%rsi) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512BW-ONLY-SLOW-LABEL: load_i8_stride3_vf64: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-SLOW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: load_i8_stride3_vf64: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpshufb %zmm3, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-FAST-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800 +; AVX512BW-ONLY-FAST-NEXT: kmovq %rax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58] +; AVX512BW-ONLY-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: load_i8_stride3_vf64: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm3, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-SLOW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800 +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512DQBW-SLOW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-SLOW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; 
AVX512DQBW-FAST-LABEL: load_i8_stride3_vf64: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm5 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vpshufb %zmm3, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-FAST-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800 +; AVX512DQBW-FAST-NEXT: kmovq %rax, %k1 +; AVX512DQBW-FAST-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58] +; AVX512DQBW-FAST-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %wide.vec = load <192 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> %strided.vec1 = 
shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> @@ -1507,13 +1626,9 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW: {{.*}} ; AVX512: {{.*}} ; AVX512BW-FAST: {{.*}} -; AVX512BW-ONLY-FAST: {{.*}} -; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512BW-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} -; AVX512DQBW-FAST: {{.*}} -; AVX512DQBW-SLOW: {{.*}} ; AVX512F-FAST: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 7e9cce1a7e8db..f42d49cbeb73d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -216,31 +216,57 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq %xmm2, (%r8) ; SSE-NEXT: retq ; -; AVX1-LABEL: load_i8_stride4_vf8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: vmovq %xmm3, (%rdx) -; AVX1-NEXT: vmovq %xmm4, (%rcx) -; AVX1-NEXT: vmovq %xmm1, (%r8) -; AVX1-NEXT: retq +; AVX1-ONLY-LABEL: load_i8_stride4_vf8: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi) +; AVX1-ONLY-NEXT: vmovq %xmm3, (%rdx) +; AVX1-ONLY-NEXT: vmovq %xmm4, (%rcx) +; AVX1-ONLY-NEXT: vmovq %xmm1, (%r8) +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i8_stride4_vf8: +; AVX2-ONLY: # 
%bb.0: +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vmovq %xmm3, (%rdx) +; AVX2-ONLY-NEXT: vmovq %xmm4, (%rcx) +; AVX2-ONLY-NEXT: vmovq %xmm1, (%r8) +; AVX2-ONLY-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf8: ; AVX512: # %bb.0: @@ -433,38 +459,38 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; 
AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -844,16 +870,16 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm1, %ymm9 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm6, %ymm9 @@ -861,48 +887,48 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm6, %ymm8 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm9 ; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm8 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm1, %ymm10 ; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm6, %ymm10 ; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm0, %ymm9 ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm6, %ymm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm10 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm9 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm1, %ymm11 ; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm6, %ymm11 ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm6, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm6, %ymm1 ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -918,7 +944,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-LABEL: load_i8_stride4_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm3 @@ -928,21 +954,21 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm5 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-NEXT: vpshufb %ymm5, %ymm1, %ymm6 ; AVX512F-NEXT: vpshufb %ymm5, 
%ymm3, %ymm5 ; AVX512F-NEXT: vpermt2d %ymm6, %ymm4, %ymm5 ; AVX512F-NEXT: vpsrld $8, %zmm2, %zmm6 ; AVX512F-NEXT: vpmovdb %zmm6, %xmm6 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-NEXT: vpshufb %ymm6, %ymm1, %ymm7 ; AVX512F-NEXT: vpshufb %ymm6, %ymm3, %ymm6 ; AVX512F-NEXT: vpermt2d %ymm7, %ymm4, %ymm6 ; AVX512F-NEXT: vpsrld $16, %zmm2, %zmm7 ; AVX512F-NEXT: vpmovdb %zmm7, %xmm7 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpermt2d %ymm1, %ymm4, %ymm3 @@ -1745,16 +1771,16 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm8 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm9 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm1, %ymm9 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1788,7 +1814,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm13 ; AVX2-ONLY-NEXT: vmovdqa %xmm7, %xmm10 ; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1796,7 +1822,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa %xmm6, %xmm7 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm0 ; 
AVX2-ONLY-NEXT: vmovdqa %xmm5, %xmm14 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1804,7 +1830,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm13 ; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -1829,16 +1855,16 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm13 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm13 ; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 @@ -1862,20 +1888,20 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; 
AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm6 ; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm2, %ymm6 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -1916,7 +1942,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX512F-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4 @@ -1934,7 +1960,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpmovdb %zmm0, %xmm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-NEXT: vpshufb %ymm8, %ymm3, %ymm9 ; AVX512F-NEXT: vpshufb %ymm8, %ymm4, %ymm10 ; AVX512F-NEXT: vpermt2d %ymm9, %ymm1, %ymm10 @@ -1949,7 +1975,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX512F-NEXT: vpshufb %ymm9, %ymm4, %ymm11 ; AVX512F-NEXT: vpermt2d %ymm10, %ymm1, %ymm11 @@ -1964,7 +1990,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpmovdb %zmm11, %xmm11 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-NEXT: vpshufb %ymm10, %ymm3, %ymm3 ; AVX512F-NEXT: vpshufb %ymm10, %ymm4, %ymm4 ; AVX512F-NEXT: vpermt2d %ymm3, %ymm1, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index 55d943b52659d..5e0f383246c6f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -150,26 +150,47 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movd %xmm3, (%r9) ; SSE-NEXT: retq ; -; AVX-LABEL: load_i8_stride5_vf4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vmovdqa (%rdi), %xmm1 
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] -; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] -; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] -; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovd %xmm3, (%rsi) -; AVX-NEXT: vmovd %xmm4, (%rdx) -; AVX-NEXT: vmovd %xmm5, (%rcx) -; AVX-NEXT: vmovd %xmm6, (%r8) -; AVX-NEXT: vmovd %xmm0, (%r9) -; AVX-NEXT: retq +; AVX1-ONLY-LABEL: load_i8_stride5_vf4: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vmovd %xmm3, (%rsi) +; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx) +; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx) +; AVX1-ONLY-NEXT: vmovd %xmm6, (%r8) +; AVX1-ONLY-NEXT: vmovd %xmm0, (%r9) +; AVX1-ONLY-NEXT: retq +; +; AVX2-LABEL: load_i8_stride5_vf4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] +; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] +; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] +; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovd %xmm3, (%rsi) +; AVX2-NEXT: vmovd %xmm4, (%rdx) +; AVX2-NEXT: vmovd %xmm5, (%rcx) +; AVX2-NEXT: vmovd %xmm6, (%r8) +; AVX2-NEXT: vmovd %xmm0, (%r9) +; AVX2-NEXT: retq %wide.vec = load <20 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> %strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> @@ -1661,7 +1682,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255> ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm7 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm10 = 
[255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] +; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] @@ -1682,7 +1704,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255> ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm12 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm9, %ymm12, %ymm9 @@ -1700,7 +1723,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255> ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm13 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm10 @@ -1718,7 +1742,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm13 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] +; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u> @@ -1736,7 +1761,8 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u> +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, 
%ymm1 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] @@ -3576,7 +3602,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11] +; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm12, %ymm5 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0] ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm12 @@ -3596,7 +3623,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12] +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm13, %ymm8 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm5, %ymm8, %ymm13 ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm10, %ymm5 @@ -3614,7 +3642,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13] +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm14, %ymm14 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm14, %ymm0 @@ -3636,7 +3665,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14] +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm6, %ymm0 @@ -3661,13 +3691,13 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm5, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm5 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm4, %ymm15, %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm11 ; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm15 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, 
%xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -3684,9 +3714,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7],ymm9[8,9,10,11,12],ymm4[13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm12 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm7 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 @@ -3699,9 +3729,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14] ; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm9 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm12 ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 @@ -3718,11 +3748,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm14 ; AVX2-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15] +; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15] ; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] ; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm0, %ymm10 @@ -3738,9 +3769,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm7, %ymm2 ; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13] ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpor 
%xmm4, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 95800b30987ae..b15879a427b31 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -4226,7 +4226,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] ; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4244,7 +4244,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm3 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm11, %ymm12, %ymm2 @@ -4266,7 +4266,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] ; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4343,7 +4343,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm14 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm14, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] ; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm13, %ymm0 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -4374,7 +4374,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] ; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm13, %ymm11 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm5[0,1,2,3,4],xmm11[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] @@ -4752,7 +4752,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm21, %xmm4 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10] ; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800 ; AVX512BW-NEXT: kmovd %r10d, %k2 ; AVX512BW-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2} @@ -4791,7 +4791,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm7, %xmm20, %xmm7 ; AVX512BW-NEXT: vpshufb %xmm9, %xmm21, %xmm9 ; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} ; AVX512BW-NEXT: vpshufb %ymm7, %ymm22, %ymm7 @@ -4813,7 +4813,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm17, %xmm8, %xmm18 ; AVX512BW-NEXT: vporq %xmm12, %xmm18, %xmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k5 ; AVX512BW-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5} @@ -4853,7 +4853,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm16, %xmm8, %xmm8 ; AVX512BW-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] ; AVX512BW-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5} ; AVX512BW-NEXT: vpshufb %xmm13, %xmm21, %xmm13 ; AVX512BW-NEXT: vpshufb %xmm16, %xmm20, %xmm15 @@ -4873,7 +4873,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> ; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} @@ -4913,7 +4913,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> ; 
AVX512BW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 6a959b11bf142..d12a7d797c64e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -213,44 +213,161 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movd %xmm0, (%rax) ; SSE-NEXT: retq ; -; AVX-LABEL: load_i8_stride7_vf4: -; AVX: # %bb.0: -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = <3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm2, (%rsi) -; AVX-NEXT: vmovd %xmm3, (%rdx) -; AVX-NEXT: vmovd %xmm5, (%rcx) -; AVX-NEXT: vmovd %xmm7, (%r8) -; AVX-NEXT: vmovd %xmm4, (%r9) -; AVX-NEXT: vmovd %xmm6, (%r10) -; AVX-NEXT: vmovd %xmm0, (%rax) -; AVX-NEXT: retq +; AVX1-ONLY-LABEL: load_i8_stride7_vf4: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = 
xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-ONLY-NEXT: vmovd %xmm2, (%rsi) +; AVX1-ONLY-NEXT: vmovd %xmm3, (%rdx) +; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx) +; AVX1-ONLY-NEXT: vmovd %xmm7, (%r8) +; AVX1-ONLY-NEXT: vmovd %xmm4, (%r9) +; AVX1-ONLY-NEXT: vmovd %xmm6, (%r10) +; AVX1-ONLY-NEXT: vmovd %xmm0, (%rax) +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i8_stride7_vf4: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-ONLY-NEXT: vpunpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-ONLY-NEXT: vmovd %xmm2, (%rsi) +; AVX2-ONLY-NEXT: vmovd %xmm3, (%rdx) +; AVX2-ONLY-NEXT: vmovd %xmm5, (%rcx) +; AVX2-ONLY-NEXT: vmovd %xmm7, (%r8) +; AVX2-ONLY-NEXT: vmovd %xmm4, (%r9) +; AVX2-ONLY-NEXT: vmovd %xmm6, (%r10) +; AVX2-ONLY-NEXT: vmovd %xmm0, (%rax) +; AVX2-ONLY-NEXT: retq +; +; AVX512F-LABEL: load_i8_stride7_vf4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX512F-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512F-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vmovd %xmm2, (%rsi) +; AVX512F-NEXT: vmovd %xmm3, (%rdx) +; AVX512F-NEXT: vmovd %xmm5, (%rcx) +; AVX512F-NEXT: vmovd %xmm7, (%r8) +; AVX512F-NEXT: vmovd %xmm4, (%r9) +; AVX512F-NEXT: vmovd %xmm6, (%r10) +; AVX512F-NEXT: vmovd %xmm0, (%rax) +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_i8_stride7_vf4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = 
xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] +; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] +; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] +; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vmovd %xmm2, (%rsi) +; AVX512BW-NEXT: vmovd %xmm3, (%rdx) +; AVX512BW-NEXT: vmovd %xmm5, (%rcx) +; AVX512BW-NEXT: vmovd %xmm7, (%r8) +; AVX512BW-NEXT: vmovd %xmm4, (%r9) +; AVX512BW-NEXT: vmovd %xmm6, (%r10) +; AVX512BW-NEXT: vmovd %xmm0, (%rax) +; AVX512BW-NEXT: retq %wide.vec = load <28 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> %strided.vec1 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> @@ -3311,7 +3428,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,2,4,6,1,2,4,6] +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3339,7 +3457,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,3,4,6,1,3,4,6] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] @@ -3457,7 +3576,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,3,5,6,1,3,5,6] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; 
AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] @@ -3718,7 +3838,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 ; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 @@ -3923,7 +4043,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512F-FAST-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,6,1,2,4,6] +; AVX512F-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm12 ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -3956,7 +4077,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u] ; AVX512F-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,3,4,6,1,3,4,6] +; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm10, %ymm10 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] @@ -3991,7 +4113,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] ; AVX512F-FAST-NEXT: vpor %xmm6, %xmm13, %xmm6 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [1,3,5,6,1,3,5,6] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] @@ -4153,7 +4276,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13 ; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm15 ; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] @@ -4339,7 +4462,8 @@ define void 
@load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u] ; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,2,4,6,1,2,4,6] +; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -4366,7 +4490,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u] ; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,4,6,1,3,4,6] +; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7] @@ -4385,7 +4510,8 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u] ; AVX512BW-FAST-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,3,5,6,1,3,5,6] +; AVX512BW-FAST-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] @@ -7377,7 +7503,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 @@ -7413,7 +7540,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = 
<0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 @@ -7442,11 +7570,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] ; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7484,9 +7612,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm8 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -7518,10 +7646,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm12 ; AVX2-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8 @@ -7553,9 +7681,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm13 ; AVX2-SLOW-NEXT: vpor %xmm9, %xmm13, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -7585,10 +7713,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} 
xmm6 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] ; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm9 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 @@ -7624,9 +7752,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm13 ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -7699,7 +7827,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm4 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm0 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 @@ -7721,7 +7850,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 @@ -7740,7 +7870,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] +; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1 @@ -7761,7 +7892,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -7784,7 +7916,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -7804,10 +7937,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] @@ -7912,7 +8045,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 @@ -7946,7 +8080,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: 
vpblendvb %ymm1, %ymm2, %ymm3, %ymm13 @@ -7978,7 +8113,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] @@ -8011,7 +8146,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,0,2,1,3,4,6] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 @@ -8037,11 +8172,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] ; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm15 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm12 ; AVX2-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 @@ -8077,10 +8212,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] ; AVX2-FAST-NEXT: vmovdqa %xmm14, %xmm13 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm15 @@ -8112,9 +8247,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm10 ; AVX2-FAST-NEXT: vpor %xmm14, %xmm10, 
%xmm10 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -8145,10 +8280,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm11 ; AVX2-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -8221,7 +8356,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7,8,9,10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm3 @@ -8241,7 +8377,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2],ymm9[3],ymm2[4,5,6],ymm9[7,8],ymm2[9,10],ymm9[11],ymm2[12,13,14],ymm9[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm9 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 @@ -8260,7 +8397,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6],ymm1[7,8],ymm6[9,10,11],ymm1[12],ymm6[13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] +; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 @@ -8281,7 +8419,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb 
%ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -8304,7 +8443,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7,8],ymm7[9],ymm11[10,11,12],ymm7[13],ymm11[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] +; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -8327,7 +8467,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [1,3,1,2,1,3,5,6] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3,4,5,6,7],ymm3[8],ymm7[9,10,11,12,13,14,15] @@ -8422,7 +8562,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 @@ -8458,7 +8599,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 @@ -8487,11 +8629,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; 
AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8529,9 +8671,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm3, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -8563,10 +8705,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm12, %xmm8 @@ -8598,9 +8740,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm13, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -8630,10 +8772,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = 
[0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm13 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm14, %xmm12 @@ -8669,9 +8811,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm13, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -8744,7 +8886,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11] +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 @@ -8766,7 +8909,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12] +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 @@ -8785,7 +8929,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13] +; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 ; 
AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm1 @@ -8806,7 +8951,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14] +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -8829,7 +8975,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15] +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -8849,10 +8996,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] @@ -8959,7 +9106,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm7 @@ -9258,7 +9405,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm0 @@ -9370,7 +9517,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -9436,7 +9584,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] @@ -9488,7 +9637,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] @@ -9663,7 +9813,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7,8],ymm11[9],ymm12[10,11,12],ymm11[13],ymm12[14,15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm3, %ymm0, %ymm7 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -10071,7 +10221,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm1[1],ymm9[2,3,4],ymm1[5],ymm9[6,7,8],ymm1[9],ymm9[10,11,12],ymm1[13],ymm9[14,15] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-SLOW-NEXT: vpternlogq $248, %ymm18, %ymm0, %ymm3 ; AVX512DQ-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 @@ -10180,7 +10330,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,2,4,6,1,2,4,6] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm6 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -10246,7 +10397,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,4,6,1,3,4,6] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm5 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] @@ -10299,7 +10451,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] ; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm2, %xmm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,3,5,6,1,3,5,6] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] @@ -10471,7 +10624,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: 
vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm21, %ymm4, %ymm5 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -10586,7 +10739,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] @@ -10866,7 +11019,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm1, %xmm9, %xmm1 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9 ; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] @@ -10966,7 +11119,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -11020,7 +11174,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] +; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] @@ -11043,7 +11198,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] +; AVX512BW-ONLY-FAST-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] @@ -11219,7 +11375,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vporq %xmm21, %xmm0, %xmm0 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-ONLY-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX512BW-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21 ; AVX512BW-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] @@ -11321,7 +11477,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm7 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = +; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm7, %xmm4 ; AVX512DQBW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm9 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u] @@ -11601,7 +11757,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQBW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX512DQBW-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm9 ; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] @@ -11698,7 +11854,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQBW-FAST-NEXT: 
vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,6,1,2,4,6] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm19 ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm5, %ymm5 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] @@ -11751,7 +11908,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm23, %xmm20, %xmm20 ; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,4,6,1,3,4,6] +; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm20 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] @@ -11774,7 +11932,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm15, %xmm15 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [1,3,5,6,1,3,5,6] +; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermd %ymm19, %ymm20, %ymm19 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7] @@ -11949,7 +12108,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vporq %xmm20, %xmm21, %xmm20 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} ymm20 {%k5} = ymm1[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQBW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] ; AVX512DQBW-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm21 ; AVX512DQBW-FAST-NEXT: vpshufb {{.*#+}} xmm22 = xmm5[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQBW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm21[0],xmm22[0],xmm21[1],xmm22[1],xmm21[2],xmm22[2],xmm21[3],xmm22[3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 9cbb3fea50c7d..657c353b82335 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -235,54 +235,103 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movd %xmm1, (%rax) ; SSE-NEXT: retq ; -; AVX1-LABEL: load_i8_stride8_vf4: -; AVX1: # %bb.0: -; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: 
vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm7 -; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: vmovd %xmm3, (%rdx) -; AVX1-NEXT: vmovd %xmm4, (%rcx) -; AVX1-NEXT: vmovd %xmm5, (%r8) -; AVX1-NEXT: vmovd %xmm6, (%r9) -; AVX1-NEXT: vmovd %xmm7, (%r11) -; AVX1-NEXT: vmovd %xmm8, (%r10) -; AVX1-NEXT: vmovd %xmm1, (%rax) -; AVX1-NEXT: retq +; AVX1-ONLY-LABEL: load_i8_stride8_vf4: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = 
<3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6 +; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-ONLY-NEXT: vmovd %xmm0, (%rsi) +; AVX1-ONLY-NEXT: vmovd %xmm3, (%rdx) +; AVX1-ONLY-NEXT: vmovd %xmm4, (%rcx) +; AVX1-ONLY-NEXT: vmovd %xmm5, (%r8) +; AVX1-ONLY-NEXT: vmovd %xmm6, (%r9) +; AVX1-ONLY-NEXT: vmovd %xmm7, (%r11) +; AVX1-ONLY-NEXT: vmovd %xmm8, (%r10) +; AVX1-ONLY-NEXT: vmovd %xmm1, (%rax) +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i8_stride8_vf4: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6 +; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm7 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; 
AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vmovd %xmm3, (%rdx) +; AVX2-ONLY-NEXT: vmovd %xmm4, (%rcx) +; AVX2-ONLY-NEXT: vmovd %xmm5, (%r8) +; AVX2-ONLY-NEXT: vmovd %xmm6, (%r9) +; AVX2-ONLY-NEXT: vmovd %xmm7, (%r11) +; AVX2-ONLY-NEXT: vmovd %xmm8, (%r10) +; AVX2-ONLY-NEXT: vmovd %xmm1, (%rax) +; AVX2-ONLY-NEXT: retq ; ; AVX512-LABEL: load_i8_stride8_vf4: ; AVX512: # %bb.0: @@ -638,74 +687,74 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} 
xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9 ; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm9 ; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm10 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm11 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpunpcklwd 
{{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1403,13 +1452,13 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-LABEL: load_i8_stride8_vf16: ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovdqa 112(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm6 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 @@ -1419,147 +1468,147 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm9 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm8, %xmm10 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm9 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm11 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm10 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] 
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm10 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm12 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm11 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm13 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12 ; AVX2-ONLY-NEXT: vmovdqa %xmm1, %xmm9 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm8, %xmm12 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm11 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm13 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; 
AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14 ; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm13 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm12 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm14 ; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm13 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm8, %xmm14 ; AVX2-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm13 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15 ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm0 ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm15 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm14 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm15 ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1,2],xmm0[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm0 ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm15 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX2-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] @@ -1580,121 +1629,237 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rax) ; AVX2-ONLY-NEXT: retq ; -; AVX512-LABEL: load_i8_stride8_vf16: -; AVX512: # %bb.0: -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0 
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm6 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512-NEXT: vpmovqb %zmm5, %xmm6 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm7 -; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm8 -; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm7 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512-NEXT: vpsrlq $8, %zmm5, %zmm7 -; AVX512-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512-NEXT: vpshufb %xmm8, %xmm2, %xmm9 -; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512-NEXT: vpsrlq $16, %zmm5, %zmm8 -; AVX512-NEXT: vpmovqb %zmm8, %xmm8 -; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm9 -; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm10 -; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm9 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] -; AVX512-NEXT: vpsrlq $24, %zmm5, %zmm9 -; AVX512-NEXT: vpmovqb %zmm9, %xmm9 -; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm10 -; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm11 -; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm10 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512-NEXT: vpsrlq $32, %zmm5, %zmm10 -; AVX512-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm11 -; 
AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm10 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX512-NEXT: vpshufb %xmm11, %xmm2, %xmm12 -; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX512-NEXT: vpsrlq $40, %zmm5, %zmm11 -; AVX512-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm12 -; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm13 -; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] -; AVX512-NEXT: vpsrlq $48, %zmm5, %zmm12 -; AVX512-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512-NEXT: vpsrlq $56, %zmm5, %zmm1 -; AVX512-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512-NEXT: vmovdqa %xmm4, (%rsi) -; AVX512-NEXT: vmovdqa %xmm6, (%rdx) -; AVX512-NEXT: vmovdqa %xmm7, (%rcx) -; AVX512-NEXT: vmovdqa %xmm8, (%r8) -; AVX512-NEXT: vmovdqa %xmm9, (%r9) -; AVX512-NEXT: vmovdqa %xmm10, (%r11) -; AVX512-NEXT: vmovdqa %xmm11, (%r10) -; AVX512-NEXT: vmovdqa %xmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: load_i8_stride8_vf16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm6 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512F-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-NEXT: vpmovqb %zmm5, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; 
AVX512F-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX512F-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX512F-NEXT: vpshufb %xmm7, %xmm3, %xmm7 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] +; AVX512F-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512F-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-NEXT: vpshufb %xmm7, %xmm0, %xmm8 +; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512F-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX512F-NEXT: vpshufb %xmm8, %xmm3, %xmm8 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] +; AVX512F-NEXT: vpsrlq $16, %zmm5, %zmm8 +; AVX512F-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm8 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512F-NEXT: vpshufb %xmm9, %xmm2, %xmm10 +; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm9 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512F-NEXT: vpsrlq $24, %zmm5, %zmm9 +; AVX512F-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-NEXT: vpshufb %xmm9, %xmm0, %xmm10 +; AVX512F-NEXT: vpshufb %xmm9, %xmm1, %xmm9 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512F-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX512F-NEXT: vpshufb %xmm10, %xmm3, %xmm10 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX512F-NEXT: vpsrlq $32, %zmm5, %zmm10 +; AVX512F-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-NEXT: vpshufb %xmm10, %xmm0, %xmm11 +; AVX512F-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-NEXT: vpshufb %xmm11, %xmm2, %xmm12 +; AVX512F-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX512F-NEXT: vpsrlq $40, %zmm5, %zmm11 +; AVX512F-NEXT: vpmovqb %zmm11, %xmm11 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512F-NEXT: vpshufb %xmm11, %xmm0, %xmm12 +; AVX512F-NEXT: vpshufb %xmm11, %xmm1, %xmm11 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-NEXT: vpshufb %xmm12, %xmm2, %xmm13 +; AVX512F-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] +; AVX512F-NEXT: vpsrlq $48, %zmm5, %zmm12 +; AVX512F-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512F-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512F-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512F-NEXT: vpsrlq $56, %zmm5, %zmm1 +; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-NEXT: vmovdqa %xmm4, (%rsi) +; AVX512F-NEXT: vmovdqa %xmm6, (%rdx) +; AVX512F-NEXT: vmovdqa %xmm7, (%rcx) +; AVX512F-NEXT: vmovdqa %xmm8, (%r8) +; AVX512F-NEXT: vmovdqa %xmm9, (%r9) +; AVX512F-NEXT: vmovdqa %xmm10, (%r11) +; AVX512F-NEXT: vmovdqa %xmm11, (%r10) +; AVX512F-NEXT: vmovdqa %xmm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_i8_stride8_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm3 +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] +; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm6 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-NEXT: vpmovqb %zmm5, %xmm6 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; 
AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm3, %xmm7 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] +; AVX512BW-NEXT: vpsrlq $8, %zmm5, %zmm7 +; AVX512BW-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vpshufb %xmm7, %xmm0, %xmm8 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] +; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm9 +; AVX512BW-NEXT: vpshufb %xmm8, %xmm3, %xmm8 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] +; AVX512BW-NEXT: vpsrlq $16, %zmm5, %zmm8 +; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9 +; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm8 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-NEXT: vpshufb %xmm9, %xmm2, %xmm10 +; AVX512BW-NEXT: vpshufb %xmm9, %xmm3, %xmm9 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] +; AVX512BW-NEXT: vpsrlq $24, %zmm5, %zmm9 +; AVX512BW-NEXT: vpmovqb %zmm9, %xmm9 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vpshufb %xmm9, %xmm0, %xmm10 +; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm9 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] +; AVX512BW-NEXT: vpshufb %xmm10, %xmm2, %xmm11 +; AVX512BW-NEXT: vpshufb %xmm10, %xmm3, %xmm10 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX512BW-NEXT: vpsrlq $32, %zmm5, %zmm10 +; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vpshufb %xmm10, %xmm0, %xmm11 +; AVX512BW-NEXT: vpshufb %xmm10, %xmm1, %xmm10 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] +; AVX512BW-NEXT: vpshufb %xmm11, %xmm2, %xmm12 +; AVX512BW-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] +; AVX512BW-NEXT: vpsrlq $40, %zmm5, %zmm11 +; AVX512BW-NEXT: vpmovqb %zmm11, %xmm11 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vpshufb %xmm11, %xmm0, %xmm12 +; AVX512BW-NEXT: vpshufb %xmm11, %xmm1, %xmm11 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] +; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm13 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3] +; AVX512BW-NEXT: vpsrlq $48, %zmm5, %zmm12 +; AVX512BW-NEXT: vpmovqb %zmm12, %xmm12 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] +; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512BW-NEXT: vpsrlq $56, %zmm5, %zmm1 +; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi) +; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx) +; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx) +; AVX512BW-NEXT: vmovdqa %xmm8, (%r8) +; AVX512BW-NEXT: vmovdqa %xmm9, (%r9) +; AVX512BW-NEXT: vmovdqa %xmm10, (%r11) +; AVX512BW-NEXT: vmovdqa %xmm11, (%r10) +; AVX512BW-NEXT: vmovdqa %xmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %wide.vec = load <128 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <16 x i32> %strided.vec1 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <16 x i32> @@ -3099,7 +3264,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-SLOW-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 @@ -3109,7 +3274,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 ; 
AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm6 @@ -3124,11 +3289,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm9 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm11 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] @@ -3172,23 +3337,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload @@ -3224,23 +3389,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1 ; 
AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, %xmm15 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm7 @@ -3274,24 +3439,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -3323,24 +3488,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -3374,22 +3539,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; 
AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -3424,25 +3589,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -3474,25 +3639,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -3552,7 +3717,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 @@ -3560,7 +3725,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 @@ -3572,11 +3737,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm9 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm11 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9 @@ -3601,23 +3766,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 @@ -3633,20 +3798,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 @@ -3663,26 +3828,26 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm5 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm2 ; AVX2-FAST-NEXT: vpshufb 
%xmm0, %xmm8, %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm11 ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm8 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm10, %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] @@ -3697,23 +3862,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm6 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm15 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm13, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -3735,22 +3900,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -3765,21 +3930,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm2 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] @@ -3793,20 +3958,20 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm4 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm6 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] @@ -3845,7 +4010,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $360, %rsp # imm = 0x168 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm10 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm1 @@ -3855,7 +4020,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm5, %xmm6 @@ -3870,11 +4035,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] @@ -3918,23 +4083,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload @@ -3970,23 +4135,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} 
xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm7 @@ -4020,24 +4185,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm15, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -4069,24 +4234,24 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -4120,22 +4285,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -4170,25 +4335,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -4220,25 +4385,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -4295,7 +4460,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 @@ -4304,7 +4469,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 @@ -4339,21 +4504,21 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] ; 
AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm10 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 @@ -4361,7 +4526,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 @@ -4386,11 +4551,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 @@ -4398,13 +4563,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] @@ -4428,13 
+4593,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 @@ -4442,13 +4607,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm26 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] @@ -4472,25 +4637,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512F-SLOW-NEXT: vpshufb 
%xmm5, %xmm11, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm28 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 @@ -4515,13 +4680,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm25 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 @@ -4530,13 +4695,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 @@ -4561,13 +4726,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 ; 
AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 @@ -4576,12 +4741,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 @@ -4604,11 +4769,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -4616,12 +4781,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] @@ -4658,7 +4823,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-LABEL: load_i8_stride8_vf32: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm18 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm16 ; AVX512F-FAST-NEXT: vpermd 
%ymm16, %ymm0, %ymm4 @@ -4675,14 +4840,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm5 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm12 ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 @@ -4696,19 +4861,19 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10 ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] @@ -4718,27 +4883,27 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; 
AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10 ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] @@ -4748,25 +4913,25 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm6 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm6 ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; 
AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm9 ; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] @@ -4788,12 +4953,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm6 ; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm15 ; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm14 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -4811,12 +4976,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm14 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm7 ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] @@ -4836,11 +5001,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm14 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm14 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14 
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] @@ -4859,11 +5024,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm8 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] @@ -4895,14 +5060,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa 224(%rdi), %xmm6 ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vmovdqa 208(%rdi), %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm3 ; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %xmm9 ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm8 @@ -4934,23 +5099,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vmovdqa 144(%rdi), %xmm11 ; AVX512BW-SLOW-NEXT: vmovdqa 160(%rdi), %xmm12 ; AVX512BW-SLOW-NEXT: vmovdqa 176(%rdi), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 ; AVX512BW-SLOW-NEXT: 
vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -4969,23 +5134,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] @@ -5004,23 +5169,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512BW-SLOW-NEXT: vpshufb 
%xmm10, %xmm9, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm19, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] @@ -5039,23 +5204,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -5074,23 +5239,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw 
{{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm7, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm11, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -5109,23 +5274,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm10 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm17 ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm15 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, 
%xmm11, %xmm18 ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm19, %xmm17 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3] @@ -5143,23 +5308,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm9 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm13, %xmm9 ; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm7 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm10 ; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm19, %xmm9 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] @@ -5194,33 +5359,33 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512BW-FAST-NEXT: vmovdqa 224(%rdi), %ymm9 ; AVX512BW-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm4 ; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 192(%rdi), %ymm11 ; AVX512BW-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm26 ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm26, %ymm3 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512BW-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm27 ; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm27, 
%ymm3 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 128(%rdi), %ymm15 ; AVX512BW-FAST-NEXT: vpermd %ymm15, %ymm1, %ymm28 ; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm28, %ymm1 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512BW-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm5 ; AVX512BW-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512BW-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm16 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm16 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-FAST-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512BW-FAST-NEXT: vpshufb %xmm16, %xmm3, %xmm17 ; AVX512BW-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 @@ -5231,22 +5396,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm16 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm16, %ymm4, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm26, %ymm7 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm18, %ymm27, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm19, %ymm28, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm20 ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3] @@ -5256,22 +5421,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; 
AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm20, %ymm4, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm21, %ymm26, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm22, %ymm27, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm28, %ymm10 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm10 ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm24 ; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3] @@ -5280,22 +5445,22 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm24 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm24, %ymm4, %ymm4 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm25 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm25, %ymm26, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm26 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm26, %ymm27, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm28, %ymm10 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5],ymm10[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; 
AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm10 ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm28 ; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3] @@ -5316,11 +5481,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm12 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] @@ -5336,11 +5501,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %ymm19, %ymm15, %ymm14 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm16 ; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] @@ -5356,11 +5521,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %ymm23, %ymm15, %ymm6 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-FAST-NEXT: 
vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm16 ; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm16[0],xmm14[1],xmm16[1],xmm14[2],xmm16[2],xmm14[3],xmm16[3] @@ -5376,11 +5541,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %ymm27, %ymm15, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5],ymm11[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] @@ -8168,7 +8333,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: subq $840, %rsp # imm = 0x348 ; AVX2-SLOW-NEXT: vmovdqa 368(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm12 @@ -8176,7 +8341,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, %xmm15 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8187,14 +8352,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 @@ 
-8309,22 +8474,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm15 @@ -8410,25 +8575,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm14 @@ -8511,25 +8676,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -8613,24 +8778,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -8716,26 +8881,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm14 @@ -8817,26 +8982,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: 
vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -8921,25 +9086,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} 
xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload @@ -9063,14 +9228,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: subq $904, %rsp # imm = 0x388 ; AVX2-FAST-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm14 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9081,7 +9246,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9089,7 +9254,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vmovdqa 272(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9107,7 +9272,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3,4,5,6],ymm4[7] @@ -9117,12 +9282,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-NEXT: vpermd %ymm4, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm11 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] @@ -9184,25 +9349,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -9258,24 +9423,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm13, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm1 ; 
AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm9 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vmovdqa %xmm11, %xmm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm14 @@ -9328,25 +9493,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm10 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12 ; AVX2-FAST-NEXT: 
vpshufb %xmm11, %xmm3, %xmm14 @@ -9355,12 +9520,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] @@ -9368,7 +9533,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] @@ -9400,26 +9565,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 @@ -9431,7 +9596,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3,4,5,6],ymm3[7] @@ -9440,7 +9605,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] @@ -9461,7 +9626,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm8, %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm7 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 @@ -9485,26 +9650,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-NEXT: 
vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm14 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -9557,25 +9722,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm14 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -9625,43 
+9790,43 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm10 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm11 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm13 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm14 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastw {{.*#+}} xmm14 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm15 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm8 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm15 = 
[3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] @@ -9739,7 +9904,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: subq $840, %rsp # imm = 0x348 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 368(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm9 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm12 @@ -9747,7 +9912,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9758,14 +9923,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm10, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm0 @@ -9880,22 +10045,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm13, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm12, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastw {{.*#+}} xmm3 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm15, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm15 @@ -9981,25 +10146,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm14 @@ -10082,25 +10247,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -10184,24 +10349,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -10287,26 +10452,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, 
%xmm7, %xmm14 @@ -10388,26 +10553,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm11, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -10492,25 +10657,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm0 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm10, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload @@ -10636,7 +10801,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 @@ -10644,7 +10809,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm14 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm5 @@ -10725,12 +10890,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -10739,7 +10904,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 @@ -10748,7 +10913,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm10 @@ -10825,13 +10990,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm6 @@ -10840,13 +11005,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -10916,14 +11081,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = 
[0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 @@ -10933,11 +11098,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm6 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -11004,13 +11169,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 @@ -11018,13 +11183,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = 
<4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm24 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -11095,13 +11260,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -11110,13 +11275,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 @@ -11184,13 +11349,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm23 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm21 
@@ -11200,13 +11365,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -11273,13 +11438,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -11288,13 +11453,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 @@ -11379,14 +11544,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-LABEL: load_i8_stride8_vf64: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: subq $408, %rsp # imm = 0x198 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; 
AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm11 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm4 @@ -11394,14 +11559,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 ; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm10 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 ; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %ymm31 ; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm1, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm5 @@ -11409,14 +11574,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm15 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] ; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm13 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] ; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm7 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm27 @@ -11474,31 +11639,31 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; 
AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm10 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm5 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm21 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm13 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 @@ -11542,31 +11707,31 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm24 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 ; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm8 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm5 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm13 @@ -11610,31 +11775,31 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm2 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm22 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 ; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm12 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm5 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 @@ -11675,32 +11840,32 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512F-FAST-NEXT: vpermd 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm4 ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm6 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm0, %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm8 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm18 ; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm10 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm24 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm10 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm22 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm12 @@ -11746,29 +11911,29 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm6 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm11 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 ; 
AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm12 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm5 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm10 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm8 @@ -11809,30 +11974,30 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm31 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm11 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm11 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm12 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm12 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm16 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm28 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = 
+; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm24 ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm15 @@ -11871,26 +12036,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm17 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm11 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm12 @@ -11954,7 +12119,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vmovdqa 496(%rdi), %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, %xmm7 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -11964,7 +12129,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vmovdqa 464(%rdi), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-SLOW-NEXT: vpshufb %xmm19, 
%xmm6, %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 ; AVX512BW-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 @@ -12052,28 +12217,28 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12 ; AVX512BW-SLOW-NEXT: vmovdqa 416(%rdi), %xmm13 ; AVX512BW-SLOW-NEXT: vmovdqa64 432(%rdi), %xmm16 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm25, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm24, %xmm19 ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm24, %xmm24 ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm26, %xmm25 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm27 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm16, %xmm24 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm16, %xmm22 ; AVX512BW-SLOW-NEXT: vpshufb %xmm27, %xmm13, %xmm25 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm13, %xmm18 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm30 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm12, %xmm24 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm12, %xmm28 ; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm4, %xmm25 @@ -12144,12 +12309,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 ; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4 ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm26, %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm26, %xmm8 @@ -12158,13 +12323,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; 
AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, %xmm23 ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm18, %xmm30 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm22 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm30 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-SLOW-NEXT: vpshufb %xmm30, %xmm28, %xmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, %xmm10 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -12232,26 +12397,26 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm4 ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm10, %xmm30 ; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3] @@ -12314,13 +12479,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa 
{{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm14, %xmm8 ; AVX512BW-SLOW-NEXT: vmovdqa %xmm15, %xmm14 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -12329,11 +12494,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm23, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm22, %xmm26 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm31, %xmm30 ; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm18, %xmm3 @@ -12390,11 +12555,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm1 ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm3 ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm15, %xmm19 @@ -12402,13 +12567,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm31, %xmm26 ; AVX512BW-SLOW-NEXT: vpshufb %xmm5, %xmm18, %xmm30 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm26 = xmm30[0],xmm26[0],xmm30[1],xmm26[1],xmm30[2],xmm26[2],xmm30[3],xmm26[3] @@ -12465,13 +12630,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm3 ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm19, %xmm5 @@ -12479,11 +12644,11 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm26 ; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm18, %xmm30 ; AVX512BW-SLOW-NEXT: vmovdqa64 %xmm18, %xmm31 @@ -12538,23 +12703,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm23, %xmm4 ; 
AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm19, %xmm5 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm5 ; AVX512BW-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm6 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm31, %xmm9 @@ -12631,40 +12796,40 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512BW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512BW-FAST-NEXT: vmovdqa 480(%rdi), %ymm1 ; AVX512BW-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm30 ; AVX512BW-FAST-NEXT: vpshufb %ymm2, %ymm30, %ymm1 ; AVX512BW-FAST-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm31 ; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm31, %ymm2 ; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm17 ; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm17, %ymm2 ; AVX512BW-FAST-NEXT: vmovdqa %ymm3, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vmovdqa64 384(%rdi), %ymm28 ; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm0, %ymm14 ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm3 ; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm10 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm7 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-FAST-NEXT: vmovdqa64 368(%rdi), %xmm20 ; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm20, %xmm2 ; AVX512BW-FAST-NEXT: vmovdqa64 352(%rdi), %xmm19 
; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm19, %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm24 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm24 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-FAST-NEXT: vmovdqa64 336(%rdi), %xmm18 ; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm18, %xmm5 ; AVX512BW-FAST-NEXT: vmovdqa64 320(%rdi), %xmm29 @@ -12712,25 +12877,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm30, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm31, %ymm13 ; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm7 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm17, %ymm13 ; AVX512BW-FAST-NEXT: vmovdqa %ymm8, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm20, %xmm15 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm19, %xmm24 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm24[0],xmm15[0],xmm24[1],xmm15[1],xmm24[2],xmm15[2],xmm24[3],xmm15[3] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm24 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm24 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9] ; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm18, %xmm26 ; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm29, %xmm21 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm21[0],xmm26[0],xmm21[1],xmm26[1],xmm21[2],xmm26[2],xmm21[3],xmm26[3] @@ -12760,25 +12925,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; 
AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm31, %ymm11 ; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm7 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm17, %ymm11 ; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12 ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10] ; AVX512BW-FAST-NEXT: vmovdqa64 %xmm18, %xmm26 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm18, %xmm15 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm21 @@ -12809,24 +12974,24 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm30, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm31, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm31 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm17, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm30 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm20, %xmm11 ; AVX512BW-FAST-NEXT: vmovdqa64 %xmm19, %xmm24 ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm19, %xmm12 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} 
xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm18, %xmm14 ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm29, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] @@ -12860,25 +13025,25 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vpermd (%rsp), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm2[7] ; AVX512BW-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload ; AVX512BW-FAST-NEXT: vpermd %ymm28, %ymm3, %ymm14 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm20, %xmm12 ; AVX512BW-FAST-NEXT: vmovdqa64 %xmm20, %xmm16 ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm19, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm21 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12] ; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm18, %xmm15 ; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm29, %xmm28 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] @@ -12911,22 +13076,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm12, %zmm28 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm8 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7] -; AVX512BW-FAST-NEXT: vmovdqa 
{{.*#+}} ymm0 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm11 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm14, %ymm12 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm16, %xmm12 ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm24, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm26, %xmm21 ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm27 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm27[0],xmm21[0],xmm27[1],xmm21[1],xmm27[2],xmm21[2],xmm27[3],xmm21[3] @@ -12956,22 +13121,22 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm3, %zmm21 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm3 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm17, %ymm5, %ymm4 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5],ymm8[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm16, %xmm8 ; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm24, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm26, %xmm12 ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm29, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] @@ -13000,20 +13165,20 @@ 
define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512BW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm8, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm1 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512BW-FAST-NEXT: vpshufb %ymm31, %ymm2, %ymm1 ; AVX512BW-FAST-NEXT: vpshufb %ymm30, %ymm14, %ymm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm16, %xmm2 ; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm24, %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-FAST-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15] ; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm26, %xmm8 ; AVX512BW-FAST-NEXT: vpshufb %xmm4, %xmm29, %xmm11 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] @@ -13082,14 +13247,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; AVX: {{.*}} ; AVX2: {{.*}} -; AVX512BW: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} -; AVX512F: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index eb0ef5caaa0a1..8f160e2bafda0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -549,7 +549,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -593,7 +593,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm7 @@ -1043,7 +1043,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm4 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm6 @@ -1120,7 +1120,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 @@ -2048,7 +2048,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm4 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] @@ -2191,7 +2191,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm5 = 
[6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 5ef699f087c32..92acf21cad010 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -2354,7 +2354,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm4, %ymm9 ; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm4 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm1, %ymm13, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5],ymm15[6],ymm4[7,8],ymm15[9],ymm4[10,11],ymm15[12],ymm4[13],ymm15[14],ymm4[15] @@ -2495,13 +2496,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq 32(%r8), %ymm8 ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm10, %ymm13 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] @@ -2526,11 +2528,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12 ; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm5 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = 
[22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] @@ -2666,13 +2670,14 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 32(%r8), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm4, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm10, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] @@ -2697,11 +2702,13 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm13, %ymm5, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm5, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[1,1,1,2,5,5,5,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3],ymm14[4],ymm5[5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10,11],ymm14[12],ymm5[13],ymm14[14],ymm5[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[0,1,2,1,4,5,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3],ymm6[4],ymm0[5,6],ymm6[7],ymm0[8,9],ymm6[10],ymm0[11],ymm6[12],ymm0[13,14],ymm6[15] @@ -2877,7 +2884,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} 
ymm12 = ymm12[2,3,2,2] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm6, %ymm13 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,1,1,2,5,5,5,6] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15] @@ -2984,16 +2992,18 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] ; AVX512F-FAST-NEXT: vprolq $16, %ymm11, %ymm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,3,2,3,6,7,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[1,1,1,2,5,5,5,6] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm13 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[3,2,3,3,7,6,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15] @@ -4715,7 +4725,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,1,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,1,2,5,5,5,6] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] @@ -5093,13 +5104,15 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] @@ -5175,11 +5188,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpbroadcastq 112(%r8), %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm15 ; AVX2-FAST-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -5465,13 +5479,15 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm10[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,2,5,5,5,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0,22,23,18,19,0,0,20,21,0,0,24,25,24,25,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm12, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,1,4,5,6,5] @@ -5547,11 +5563,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 112(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7,8],ymm7[9],ymm1[10],ymm7[11],ymm1[12,13],ymm7[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm7 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -5871,7 +5888,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm21[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-SLOW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm9, %ymm7 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm26[1,1,1,2,5,5,5,6] @@ -6158,17 +6176,19 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vprolq $16, %ymm8, %ymm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512F-FAST-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm1 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[2,3,2,3,6,7,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512F-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,1,1,2,5,5,5,6] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,2,3,3,7,6,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 71505f5912548..c20981d0d9398 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -1354,7 +1354,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <1,u,u,2,u,u,3,u> +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,0,3,2,1,0,3,2] +; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[1,1,1,1] @@ -1367,7 +1368,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <5,u,u,6,u,u,7,u> +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [5,0,7,6,5,0,7,6] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} ymm15 = ymm13[4],ymm1[4],ymm13[5],ymm1[5],ymm13[6],ymm1[6],ymm13[7],ymm1[7],ymm13[12],ymm1[12],ymm13[13],ymm1[13],ymm13[14],ymm1[14],ymm13[15],ymm1[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -1393,7 +1395,8 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,4,0,6,5,4,0,6] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm1[0],ymm13[1],ymm1[1],ymm13[2],ymm1[2],ymm13[3],ymm1[3],ymm13[8],ymm1[8],ymm13[9],ymm1[9],ymm13[10],ymm1[10],ymm13[11],ymm1[11] ; 
AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -2810,7 +2813,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 @@ -3085,7 +3089,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 @@ -3653,7 +3658,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[1],ymm13[1],ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[8],ymm13[8],ymm1[9],ymm13[9],ymm1[10],ymm13[10],ymm1[11],ymm13[11] ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm10[4],ymm5[5],ymm10[5],ymm5[6],ymm10[6],ymm5[7],ymm10[7],ymm5[12],ymm10[12],ymm5[13],ymm10[13],ymm5[14],ymm10[14],ymm5[15],ymm10[15] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm11, %ymm10 ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm10[2,2,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] @@ -4062,7 +4068,8 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [12,1,2,13,4,5,14,7] ; AVX512DQ-FAST-NEXT: vpermt2d %ymm3, %ymm9, %ymm22 ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6489,7 +6496,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: 
vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 @@ -7207,7 +7215,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] @@ -8302,7 +8311,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm12, %zmm2, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm12, %ymm6, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm8, %ymm24 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = @@ -9033,7 +9043,8 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpermt2d %ymm6, %ymm19, %ymm21 ; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm2 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm6 ; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[2,2,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 8cd6e2c38b429..da36f165bd4c5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -4067,7 +4067,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] @@ -4327,7 +4327,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4354,7 +4355,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm15 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[0,0,2,1,4,4,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7,8,9,10],ymm10[11],ymm3[12,13],ymm10[14],ymm3[15] @@ -4519,12 +4520,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2,3],xmm8[4],xmm13[5,6],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm13 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm14 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload @@ -4791,11 +4792,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[1,1,2,3] @@ -4908,7 +4909,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload @@ -4932,7 +4934,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload @@ -5142,7 +5144,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -5330,325 +5332,651 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; -; AVX512F-FAST-LABEL: store_i16_stride7_vf32: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; 
AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 -; AVX512F-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2 -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] -; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15> -; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] -; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vprold $16, %ymm9, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] -; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3 -; AVX512F-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11> -; AVX512F-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm9 -; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17 -; AVX512F-FAST-NEXT: vpshufb 
{{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vprold $16, %ymm15, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-FAST-NEXT: vmovdqa64 (%rax), %zmm27 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = -; AVX512F-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512F-FAST-NEXT: vprold $16, %xmm10, %xmm2 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512F-FAST-NEXT: 
vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-FAST-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7> -; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm27 = mem[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] -; AVX512F-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm2 = mem[2,1,3,3] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3] -; AVX512F-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[2,1,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17 -; AVX512F-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = mem[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm6 = mem[0,0,1,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1] -; AVX512F-FAST-NEXT: vpshuflw 
{{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 -; AVX512F-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 -; AVX512F-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4 -; AVX512F-FAST-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512F-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 -; AVX512F-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = -; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 
64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) -; AVX512F-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf32: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2 +; 
AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm3 = 
ymm10[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm10, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm7 = 
ymm2[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7> +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq 
{{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, 
%zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-FAST-LABEL: store_i16_stride7_vf32: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm4 +; AVX512DQ-FAST-NEXT: vporq %ymm3, %ymm4, %ymm19 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm14, %ymm2 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vporq %ymm1, %ymm2, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,12,13,u,15> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vprold $16, %ymm9, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd 
{{.*#+}} ymm3 = ymm10[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm4, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm28 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7,8,9],ymm3[10],ymm7[11,12],ymm3[13],ymm7[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,u,3,10,10,11,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm5, %zmm17, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm9 +; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpandn %ymm5, %ymm7, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm17 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm2[3],ymm6[4,5],ymm2[6],ymm6[7,8,9,10],ymm2[11],ymm6[12,13],ymm2[14],ymm6[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] +; 
AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5,6,7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm27 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm29 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm27, %zmm9, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm10, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2],xmm8[3,4],xmm2[5],xmm8[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm6 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5,6,7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2],ymm5[3],ymm15[4,5],ymm5[6],ymm15[7,8,9,10],ymm5[11],ymm15[12,13],ymm5[14],ymm15[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7,8,9],ymm13[10],ymm15[11,12],ymm13[13],ymm15[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm7[2],ymm15[3,4],ymm7[5],ymm15[6,7,8,9],ymm7[10],ymm15[11,12],ymm7[13],ymm15[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,u,u,u,7,u,u,7> +; AVX512DQ-FAST-NEXT: vpermd %ymm11, %ymm15, %ymm15 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm27 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $246, (%rsp), %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm24[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm22[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm3, %zmm24, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm28 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm25, %zmm17 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm17 +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm19[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm22 = ymm18[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm26[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm20[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm30 = ymm16[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] +; 
AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm29 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm24, %zmm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm21, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm2, %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm3 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm11, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rax) +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 384(%rax) +; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride7_vf32: ; AVX512BW: # %bb.0: @@ -8665,7 +8993,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[1,1,2,2] @@ -8953,7 +9281,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm7, %ymm8 ; AVX2-SLOW-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -9211,7 +9540,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] @@ -9249,7 +9578,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] @@ -9338,7 +9668,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm13, %ymm14 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = 
[22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm11 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15] @@ -9389,7 +9720,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] @@ -9713,11 +10045,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm1 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -10241,12 +10573,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm9, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -10400,7 +10732,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload @@ -10443,7 +10776,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm14, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -10483,7 +10816,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm15, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] @@ -10597,7 +10931,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm13, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm15, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7] @@ -10777,7 +11112,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,3,2,10,10,10,11] ; AVX512F-SLOW-NEXT: vpermi2q %zmm8, %zmm0, %zmm10 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512F-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, %ymm10 ; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm8, %ymm17 @@ -10998,7 +11334,7 @@ define void 
@store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm9 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] @@ -11416,728 +11752,1465 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; -; AVX512F-FAST-LABEL: store_i16_stride7_vf64: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $2200, %rsp # imm = 0x898 -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm15 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm3 -; AVX512F-FAST-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 -; AVX512F-FAST-NEXT: vporq %ymm4, %ymm5, %ymm16 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm5 -; AVX512F-FAST-NEXT: vporq %ymm4, %ymm5, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm5 -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1 -; 
AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa %ymm6, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm5 -; AVX512F-FAST-NEXT: vporq %ymm4, %ymm5, %ymm31 -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm4 -; AVX512F-FAST-NEXT: vpor %ymm2, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512F-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 -; AVX512F-FAST-NEXT: vporq %ymm9, %ymm0, %ymm22 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] -; AVX512F-FAST-NEXT: vprold $16, %ymm5, %ymm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] -; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm23[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm23[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8,9,10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm15[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm15[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm15[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm23[1,1,1,1,5,5,5,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6> -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15],zero,zero,ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17],zero,zero,ymm9[u,u],zero,zero -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm15 -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm15 -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vprold $16, 
%ymm1, %ymm8 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 64(%rax), %zmm23 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-FAST-NEXT: vmovdqa 96(%rax), %ymm0 -; AVX512F-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8 -; AVX512F-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 -; AVX512F-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa 64(%rax), %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm27 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %xmm9 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15] -; AVX512F-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512F-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm8 -; AVX512F-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 -; AVX512F-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd 
{{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> -; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm0 -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2 -; AVX512F-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm21, %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm5, %ymm11 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-FAST-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,0,2,1,4,4,6,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm23, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9] -; AVX512F-FAST-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm14 -; AVX512F-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpbroadcastd 96(%rax), %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vprold $16, %xmm0, %xmm10 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[1,1,2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm30 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm23 -; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm8 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm4[1],xmm11[2,3],xmm4[4],xmm11[5,6],xmm4[7] -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 -; AVX512F-FAST-NEXT: vpbroadcastd 100(%rax), %ymm0 -; AVX512F-FAST-NEXT: vpbroadcastd 104(%rax), %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm8, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm17 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-FAST-NEXT: vprold $16, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3,4],xmm5[5],xmm0[6,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX512F-FAST-NEXT: vmovdqa %xmm7, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11] -; AVX512F-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5 -; AVX512F-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] -; AVX512F-FAST-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3,4],xmm9[5],xmm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512F-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm29 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm2 -; AVX512F-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512F-FAST-NEXT: vpbroadcastd (%rax), %ymm6 -; AVX512F-FAST-NEXT: vpbroadcastd 4(%rax), %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,1,5,5,5,5] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm14 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512F-FAST-NEXT: vprold $16, %ymm3, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] -; AVX512F-FAST-NEXT: vmovdqa %ymm9, %ymm10 -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] -; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm6 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[2,2,2,2,6,6,6,6] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm21, %zmm0 -; 
AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = -; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[3,3,3,3,7,7,7,7] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9] -; AVX512F-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm19, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm22 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm8 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,3,8,8,9,9] -; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 -; AVX512F-FAST-NEXT: vpbroadcastd 36(%rax), %ymm1 -; AVX512F-FAST-NEXT: vpbroadcastd 40(%rax), %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm10 -; AVX512F-FAST-NEXT: vprold $16, %xmm0, %xmm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7] -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] -; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm23 = mem[0,2,2,3] -; AVX512F-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm4 = mem[2,1,3,3] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm3 = mem[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = mem[0,2,2,3] -; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; AVX512F-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,2,3,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,3,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] -; AVX512F-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm6 = mem[2,1,3,2] 
-; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm7 = mem[2,2,2,3] -; AVX512F-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[0,2,2,3] -; AVX512F-FAST-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[2,1,3,3] -; AVX512F-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[2,2,2,3] -; AVX512F-FAST-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm5 = mem[0,2,2,3] -; AVX512F-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm31 = mem[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm30 = mem[0,2,2,3] -; AVX512F-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm29 = mem[2,1,3,2] -; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm28 = mem[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm8 = mem[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm12[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm12 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm11[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm27[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm26[2,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm24[2,1,3,2] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm20[2,2,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm23, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm3, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm20 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm15 -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm23, %zmm1 -; 
AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm13, %zmm23, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm9 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm9 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm21, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm21 -; AVX512F-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm17 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm4 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm25, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm26 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm29 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm0 = mem[0,2,2,3] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm7 = mem[2,1,3,3] -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm8 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm8 = mem[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm27 = mem[0,0,1,3] -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm24 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm12 = mem[0,2,2,3] -; AVX512F-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm25 = mem[0,1,1,3] -; AVX512F-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm15 = mem[2,1,3,3] -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm16 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm28 = mem[0,0,2,1] -; 
AVX512F-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm18 = mem[0,0,1,3] -; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm19 = mem[0,0,1,1] -; AVX512F-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm12, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm0, %zmm15 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm22 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm24, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm23, %zmm3 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm28, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm23, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm8 -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm0, %zmm10 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13 -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm21, 448(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 704(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 768(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm9, 832(%rax) -; AVX512F-FAST-NEXT: addq $2200, %rsp # imm = 0x898 -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: store_i16_stride7_vf64: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $2200, %rsp # imm = 0x898 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} 
ymm7 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm3, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm5, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm5, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm4, %ymm5, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vporq %ymm9, %ymm0, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm5, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm23[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm23[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8,9,10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm15[2,2,2,2,6,6,6,6] +; 
AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm15[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm15[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm23[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6> +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15],zero,zero,ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17],zero,zero,ymm9[u,u],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm1, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rax), %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r9), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%r8), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: 
vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = 
[14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermi2d %zmm0, %zmm21, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm3, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 96(%rax), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm15, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm4[1],xmm11[2,3],xmm4[4],xmm11[5,6],xmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm15, %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 100(%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 104(%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm8, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3,4],xmm5[5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm7, %xmm13 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: 
vpternlogq $184, %zmm0, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3,4],xmm9[5],xmm0[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm13, %xmm29 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm14, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,1,5,5,5,5] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vprold $16, %ymm3, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm9, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[2,2,2,2,6,6,6,6] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[3,3,3,3,7,7,7,7] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm19, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,3,8,8,9,9] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 36(%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpbroadcastd 40(%rax), %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm17, %xmm15 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] +; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm3 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; 
AVX512F-ONLY-FAST-NEXT: # ymm31 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm30 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm29 = mem[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm12[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm11[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm27[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm26[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm24[2,1,3,2] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm20[2,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm3, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm13, %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: 
vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm4, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm25, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm25 = mem[0,1,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[2,1,3,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[0,0,2,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm18 = mem[0,0,1,3] +; AVX512F-ONLY-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,0,1,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; 
AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm0, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm24, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm23, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm7, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $2200, %rsp # imm = 0x898 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-FAST-LABEL: store_i16_stride7_vf64: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $2200, %rsp # imm = 0x898 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm3 +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm3, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 
+; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm5, %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm5 +; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm5, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm10 +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm5 +; AVX512DQ-FAST-NEXT: vporq %ymm4, %ymm5, %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm5, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm13 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm4 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512DQ-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm5, %ymm9 +; AVX512DQ-FAST-NEXT: vporq %ymm9, %ymm0, %ymm22 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7,8,9],ymm9[10],ymm0[11,12],ymm9[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vprold $16, %ymm5, %ymm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm23[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm23[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8,9,10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm15[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm15[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2,3],ymm0[4],ymm8[5,6,7,8],ymm0[9],ymm8[10,11],ymm0[12],ymm8[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd 
{{.*#+}} ymm0 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm15[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm23[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <5,u,u,u,6,u,u,6> +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512DQ-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15],zero,zero,ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17],zero,zero,ymm9[u,u],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm16, %ymm8, %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm15 +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm15 +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm8, %ymm8 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm0 +; AVX512DQ-FAST-NEXT: vprold $16, %ymm1, %ymm8 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpermi2d %zmm23, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm8 +; AVX512DQ-FAST-NEXT: vpandnq %ymm8, %ymm16, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, 
%zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpbroadcastd 72(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rax), %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r9), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%r8), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,0,1,1,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm24, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpandn %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm24, %zmm15 +; AVX512DQ-FAST-NEXT: vpternlogq $248, %zmm26, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm11, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm15 = ymm7[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,u,3,10,10,11,11> +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm22, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm17, %ymm2 +; AVX512DQ-FAST-NEXT: vpandnq %ymm2, %ymm16, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm11, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm20, %ymm6 +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm21, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm15 +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [14,21,0,0,15,22,0,15,14,21,0,0,15,22,0,15] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermi2d %zmm0, %zmm21, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 
32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm7 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8,9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FAST-NEXT: # ymm22 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vmovdqa 96(%r9), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa 
96(%r8), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, %xmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm2, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm3 +; AVX512DQ-FAST-NEXT: vpbroadcastd 96(%rax), %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 96(%rdx), %xmm10 +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm15, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm4[1],xmm11[2,3],xmm4[4],xmm11[5,6],xmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm15, %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd 100(%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 104(%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm8, 
%xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3,4],xmm5[5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm7, %xmm13 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,1,8,9,9,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd 64(%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vpbroadcastd 68(%rax), %ymm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2],xmm0[3,4],xmm9[5],xmm0[6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm13, %xmm29 +; AVX512DQ-FAST-NEXT: vpshufb 
%xmm14, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpbroadcastd (%rax), %ymm6 +; AVX512DQ-FAST-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vprold $16, %ymm3, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[1,2,2,3,5,6,6,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,0,2,1,4,4,6,5] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm9, %ymm10 +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm12, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7,8,9],ymm8[10],ymm0[11,12],ymm8[13],ymm0[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x8 {{.*#+}} zmm8 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FAST-NEXT: # zmm8 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm6, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm19, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vpbroadcastd 32(%rax), %ymm11 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm8 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 
{{.*#+}} zmm2 = [0,1,1,3,8,8,9,9] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpbroadcastd 36(%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vpbroadcastd 40(%rax), %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm10 +; AVX512DQ-FAST-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm17, %xmm15 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm15, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,2] +; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm4 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm3 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} ymm9 = ymm5[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm5[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm5[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm6 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = 
mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermpd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm31 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm30 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm29 = mem[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm28 = mem[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm8[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm12[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm16 = ymm11[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm27[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm27 = ymm26[2,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm24[2,1,3,2] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm24 = ymm20[2,2,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm20, %ymm15 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm13, %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm20 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogd $226, 124(%r8){1to8}, %ymm4, %ymm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm13 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm25, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm8, %zmm0, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = mem[0,1,1,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,2,2,3] +; AVX512DQ-FAST-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm25 = mem[0,1,1,3] +; AVX512DQ-FAST-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm15 = mem[2,1,3,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; 
AVX512DQ-FAST-NEXT: # ymm28 = mem[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpermq $208, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm18 = mem[0,0,1,3] +; AVX512DQ-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,0,1,1] +; AVX512DQ-FAST-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm12 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm3, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm0, %zmm15 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm22 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm24, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm23, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm7, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm5 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm13 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 832(%rax) +; AVX512DQ-FAST-NEXT: addq $2200, %rsp # imm = 0x898 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-LABEL: store_i16_stride7_vf64: ; AVX512BW: # %bb.0: @@ -12391,11 +13464,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} -; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} -; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 5d478ae0f3e25..4c7b0bcdc11c1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -619,10 +619,12 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm11[2,3],ymm9[2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,u,u,u,u,u,7,u> +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,0,7,0,6,0,7,0] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,6,0,7,0,6,0,7] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index ab3122960f53c..8d92086c44346 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -1177,9 +1177,11 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <6,u,u,u,u,23,31,7> +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 @@ -1221,9 +1223,11 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm4 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm5 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [31,7,15,23,31,7,15,23] +; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <6,u,u,u,u,23,31,7> +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index c94df69efc80d..664be09b5118a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -617,7 +617,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] +; AVX512F-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512F-NEXT: movb $12, %r10b ; AVX512F-NEXT: kmovw %r10d, %k1 @@ -663,7 +664,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] +; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> @@ -702,7 +704,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,12,4,12] +; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 ; AVX512BW-NEXT: movb $12, %r10b ; AVX512BW-NEXT: kmovd %r10d, %k1 @@ -748,7 +751,8 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,7,15] +; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm4, %zmm11 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm11[0,1,2,3],zmm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> @@ -1462,281 +1466,1125 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i64_stride6_vf16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512F-NEXT: movb $12, %r10b -; AVX512F-NEXT: kmovw %r10d, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512F-NEXT: movb $16, %r10b -; AVX512F-NEXT: kmovw %r10d, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512F-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512F-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; 
AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512F-NEXT: movb $48, %r9b -; AVX512F-NEXT: kmovw %r9d, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512F-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512F-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512F-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512F-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = -; AVX512F-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm14, 
%zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf16: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: movb $12, %r10b +; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: movb $16, %r10b +; AVX512F-ONLY-SLOW-NEXT: kmovw %r10d, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: movb $48, %r9b +; AVX512F-ONLY-SLOW-NEXT: kmovw %r9d, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, 
%zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i64_stride6_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), 
%zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 -; AVX512BW-NEXT: movb $12, %r10b -; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} -; AVX512BW-NEXT: movb $16, %r10b -; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 -; AVX512BW-NEXT: movb $48, %r9b -; AVX512BW-NEXT: kmovd %r9d, %k2 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 -; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm20 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm21 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} -; AVX512BW-NEXT: 
vinserti32x4 $2, (%r8), %zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = -; AVX512BW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm25 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf16: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), 
%zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: movb $12, %r10b +; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: movb $16, %r10b +; AVX512F-ONLY-FAST-NEXT: kmovw %r10d, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512F-ONLY-FAST-NEXT: movb $48, %r9b +; AVX512F-ONLY-FAST-NEXT: kmovw %r9d, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf16: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: movb $12, %r10b +; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: movb $16, %r10b +; AVX512DQ-SLOW-NEXT: kmovw %r10d, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512DQ-SLOW-NEXT: movb $48, %r9b +; AVX512DQ-SLOW-NEXT: kmovw %r9d, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] +; AVX512DQ-SLOW-NEXT: # zmm17 = 
mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: store_i64_stride6_vf16: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: movb $12, %r10b +; AVX512DQ-FAST-NEXT: kmovw %r10d, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: movb $16, %r10b +; AVX512DQ-FAST-NEXT: kmovw %r10d, %k2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512DQ-FAST-NEXT: movb $48, %r9b +; AVX512DQ-FAST-NEXT: kmovw %r9d, %k2 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] +; AVX512DQ-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm22 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, 
%zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf16: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: movb $12, %r10b +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: movb $16, %r10b +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: 
vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: movb $48, %r9b +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r9d, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: 
vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf16: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
64(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: movb $12, %r10b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: movb $16, %r10b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r10d, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: movb $48, %r9b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r9d, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 
{%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; 
AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf16: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: movb $12, %r10b +; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: movb $16, %r10b +; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512DQBW-SLOW-NEXT: movb $48, %r9b +; AVX512DQBW-SLOW-NEXT: kmovd %r9d, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = 
<0,1,13,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-SLOW-NEXT: 
vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf16: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: movb $12, %r10b +; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: movb $16, %r10b +; AVX512DQBW-FAST-NEXT: kmovd %r10d, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 {%k2} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: 
vpermi2q %zmm11, %zmm7, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm18 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm18, %zmm15 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm12, %zmm9 +; AVX512DQBW-FAST-NEXT: movb $48, %r9b +; AVX512DQBW-FAST-NEXT: kmovd %r9d, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm9 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,13,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm19, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm21, %zmm22 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-FAST-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm17, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm15 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,9,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm22, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm12 {%k2} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm19, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm5, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm22, %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm17 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm19, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm21 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm20[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm0, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm18, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm20, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm14, %zmm13, %zmm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm21[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm21, %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm19, %zmm19 
+; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm20, %zmm19 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm20, %zmm21 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm22 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm22, %zmm13 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm21[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm14, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm21, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm24, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm25 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm25 = ymm25[1],mem[1],ymm25[3],mem[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm25, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm25, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,10,2,3,4,5,6,11] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm8, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm20[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm21, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm23, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm25, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, (%rax) +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64 @@ -3204,565 +4052,2261 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i64_stride6_vf32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm30 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm4 -; 
AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm10 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm28 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = -; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512F-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512F-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512F-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512F-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512F-NEXT: 
vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512F-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512F-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: movb $12, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512F-NEXT: movb $48, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512F-NEXT: 
vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512F-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512F-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512F-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512F-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512F-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: movb $16, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512F-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512F-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512F-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512F-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512F-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512F-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512F-NEXT: addq $712, %rsp # imm = 0x2C8 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; 
AVX512F-ONLY-SLOW-LABEL: store_i64_stride6_vf32: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # 
zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: movb $12, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: movb $48, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: movb $16, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512F-ONLY-SLOW-NEXT: 
vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i64_stride6_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm21 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm8 -; AVX512BW-NEXT: 
vmovdqa64 192(%rsi), %zmm10 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm28 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm27 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm24 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm20 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] -; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] -; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] -; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 -; AVX512BW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 -; 
AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 -; AVX512BW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 -; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: movb $12, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} -; AVX512BW-NEXT: movb $48, %al -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 -; AVX512BW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 -; AVX512BW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 -; AVX512BW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm11 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: movb $16, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; 
AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload -; AVX512BW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm6 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1024(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512BW-NEXT: addq $712, %rsp # imm = 0x2C8 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; 
AVX512F-ONLY-FAST-LABEL: store_i64_stride6_vf32: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # 
zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512F-ONLY-FAST-NEXT: movb $12, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: movb $48, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: movb $16, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512F-ONLY-FAST-NEXT: 
vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: store_i64_stride6_vf32: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] +; AVX512DQ-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] +; AVX512DQ-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] +; AVX512DQ-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: movb $12, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: movb $48, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 +; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 
+; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: movb $16, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: store_i64_stride6_vf32: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] +; AVX512DQ-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] +; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 
+; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] +; AVX512DQ-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q 
%zmm21, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: movb $12, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: movb $48, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k2 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 
%zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: movb $16, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQ-FAST-NEXT: 
vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride6_vf32: +; 
AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: 
# zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: movb $12, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: movb $48, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: 
vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movb $16, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q 
%zmm1, %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: store_i64_stride6_vf32: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: movb $12, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, 
%k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: movb $48, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; 
AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: movb $16, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512BW-ONLY-FAST-NEXT: 
vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: store_i64_stride6_vf32: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm17 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, 
%zmm29, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: movb $12, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: movb $48, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, 
%k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti32x4 
$2, (%r8), %zmm11, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: movb $16, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; 
AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: store_i64_stride6_vf32: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm20 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,12,5,13,4,12,5,13] +; AVX512DQBW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm16 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm16 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm16, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 
%zmm2, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm16, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm16, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm28, %zmm6, %zmm16 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [1,9,2,10,1,9,2,10] +; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm19, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [5,13,6,14,5,13,6,14] +; AVX512DQBW-FAST-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm18, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm19, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm18, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm19, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm18, %zmm26 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm29, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm28, %zmm31, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm29, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm31, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm29, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm24, %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm9, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm20, %zmm15, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm31, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm9, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm9, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm17, %zmm30, %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm2 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,8,1,9,0,8,1,9] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm3, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm10, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm12, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: movb $12, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: movb $48, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm10 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm12 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm8 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm22 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm25 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm26 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm19 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,9,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm25 +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,1,2,9,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm13, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm17 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm13, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm13, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 {%k2} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,13,u,4,5,6,7> +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,13,4,5,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm2, %zmm26 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],mem[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, (%r8), %zmm11, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,8,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm8 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 64(%r8), %zmm23, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm13 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 128(%r8), %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 192(%r8), %zmm3, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm11 = ymm11[1],mem[1],ymm11[3],mem[3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm11, %zmm3, %zmm11 +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm12 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm12 = zmm30[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: movb $16, %al +; 
AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <10,u,2,3,4,5,11,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm23, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = <14,u,2,3,4,5,15,u> +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm30, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],mem[1],ymm3[3],mem[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm14 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm30, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],mem[1],ymm4[3],mem[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm27, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm27 # 64-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # zmm27 = zmm5[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm30, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],mem[1],ymm6[3],mem[3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm29, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm23, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm15[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,10,2,3,4,5,6,11] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm30, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,12,6,7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,14,2,3,4,5,6,15] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm29, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm28, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm7, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm29, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm28, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm7, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm29, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm28, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm7, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm29, %zmm23 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 %zmm25, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQBW-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64 @@ -6827,7 +9371,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm23 ; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm19 ; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm10 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512F-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6996,7 +9541,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512F-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vpermt2q %zmm23, %zmm1, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512F-NEXT: vpermt2q %zmm22, %zmm25, %zmm9 @@ -7496,7 +10042,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm23 ; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,4,12] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7665,7 +10212,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,15,7,15] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm9 @@ -8169,16 +10717,8 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE: 
{{.*}} ; AVX2-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} -; AVX512BW-ONLY-FAST: {{.*}} -; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512BW-SLOW: {{.*}} -; AVX512DQ-FAST: {{.*}} -; AVX512DQ-SLOW: {{.*}} -; AVX512DQBW-FAST: {{.*}} -; AVX512DQBW-SLOW: {{.*}} ; AVX512F-FAST: {{.*}} -; AVX512F-ONLY-FAST: {{.*}} -; AVX512F-ONLY-SLOW: {{.*}} ; AVX512F-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index ef9165d5cbf8b..43d48efd579e2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -770,7 +770,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 @@ -799,7 +800,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: movb $-61, %sil @@ -892,7 +894,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 @@ -921,7 +924,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512F-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: movb $-61, %sil @@ -1007,7 +1011,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512DQ-SLOW-NEXT: vbroadcasti128 
{{.*#+}} ymm9 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-SLOW-NEXT: movb $-61, %sil @@ -1041,7 +1046,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k2 @@ -1128,7 +1134,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FAST-NEXT: movb $-61, %sil @@ -1174,7 +1181,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k2 @@ -1256,7 +1264,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 @@ -1285,7 +1294,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: movb $-61, %sil @@ -1378,7 +1388,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm7, %zmm6, %zmm11 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm6 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-ONLY-FAST-NEXT: 
vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm7 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 @@ -1407,7 +1418,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [6,14,6,14,6,14,6,14] ; AVX512BW-ONLY-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,2,3],zmm8[4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $-61, %sil @@ -1493,7 +1505,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQBW-SLOW-NEXT: movb $-61, %sil @@ -1527,7 +1540,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm14 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm7 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm11 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k2 @@ -1614,7 +1628,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [6,14,6,14,6,14,6,14] ; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 ; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQBW-FAST-NEXT: movb $-61, %sil @@ -1660,7 +1675,8 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm7, %zmm13 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm10, %zmm7 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm10 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm8 ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k2 @@ -2497,9 +2513,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
%zmm3, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -2615,7 +2633,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28 @@ -2708,7 +2727,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12 @@ -2768,7 +2788,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 ; AVX512F-ONLY-FAST-NEXT: movb $24, %dil @@ -2917,9 +2938,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -3038,7 +3061,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3 ; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] ; 
AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23 @@ -3128,7 +3152,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 ; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm12 @@ -3186,7 +3211,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm26 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, %zmm24 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24 ; AVX512DQ-FAST-NEXT: movb $24, %dil @@ -3335,9 +3361,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm1 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm14, %zmm12 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -3453,7 +3481,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm2, %zmm22, %zmm6 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm7, %zmm8, %zmm22 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm7, %zmm28 @@ -3546,7 +3575,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm16 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm17 
; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm21, %zmm17 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm12 @@ -3606,7 +3636,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm25, %zmm13, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm26, %zmm29 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, %zmm25 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm25 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %dil @@ -3755,9 +3786,11 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm13 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm14, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -3876,7 +3909,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3 ; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k3} = zmm6[0],zmm2[0],zmm6[2],zmm2[2],zmm6[4],zmm2[4],zmm6[6],zmm2[6] ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm16, %zmm6 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm3 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm4, %zmm16 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm0, %zmm23 @@ -3966,7 +4000,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm19, %zmm16 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm18 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm20, %zmm18 ; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm12 @@ -4024,7 +4059,8 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm24, %zmm13, %zmm28 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm29 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm14, %zmm25, %zmm29 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm26 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, %zmm24 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm24 ; AVX512DQBW-FAST-NEXT: movb $24, %dil @@ -5919,9 +5955,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermt2q 
%zmm10, %zmm5, %zmm30 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm13 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm8 @@ -6329,11 +6367,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 @@ -6389,7 +6429,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 @@ -6801,9 +6842,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm29 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm3 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm8 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm22 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm17 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm15 @@ -7211,10 +7254,12 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 ; 
AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 @@ -7268,7 +7313,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 @@ -7674,9 +7720,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm30 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm19 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm20, %zmm13 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm3, %zmm8 @@ -8084,11 +8132,13 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm27 {%k3} = zmm1[0],zmm11[0],zmm1[2],zmm11[2],zmm1[4],zmm11[4],zmm1[6],zmm11[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm3, %zmm9 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm6, %zmm2 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm19 @@ -8144,7 +8194,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k3} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] ; 
AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm28, %zmm6 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm22, %zmm28 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k3} = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 @@ -8556,9 +8607,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm4, %zmm29 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm3 -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm8 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm22 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm22 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm22, %zmm17 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm15 @@ -8966,10 +9019,12 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm26 {%k3} = zmm5[0],zmm3[0],zmm5[2],zmm3[2],zmm5[4],zmm3[4],zmm5[6],zmm3[6] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm9 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm9 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm25 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm25 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm16 @@ -9023,7 +9078,8 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm29 {%k3} = zmm30[0],zmm8[0],zmm30[2],zmm8[2],zmm30[4],zmm8[4],zmm30[6],zmm8[6] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm1 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm30, %zmm5 -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm20 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm20 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm20, %zmm30 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm17 {%k3} = zmm2[0],zmm0[0],zmm2[2],zmm0[2],zmm2[4],zmm0[4],zmm2[6],zmm0[6] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm7 @@ -12937,10 +12993,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 @@ -13824,11 +13882,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -14836,10 +14896,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512DQ-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm19 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 @@ -15721,11 +15783,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm23 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7] +; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -16728,10 +16792,12 @@ define void 
@store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm8, %zmm7 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm10 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm3 @@ -17615,11 +17681,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm10, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [15,7,15,7] +; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm27, %zmm3 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload @@ -18627,10 +18695,12 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm17, %zmm3 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm7 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [15,7,15,7] +; AVX512DQBW-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm7, %zmm19 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 @@ -19512,11 +19582,13 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm8, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm23 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm23 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm1 ; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} ymm28 = +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [15,7,15,7] +; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1] ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm3 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index adcb0f5815815..083c206fe9356 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -821,7 +821,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] ; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] +; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm15 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 @@ -831,7 +832,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] ; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm13 @@ -842,7 +844,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} ; AVX512F-NEXT: vpermi2q %zmm9, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512F-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm15 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 @@ -853,7 +856,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} ; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512F-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm18, %zmm6 @@ -936,7 +940,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm13 {%k1} = 
zmm1[0],zmm3[0],zmm1[2],zmm3[2],zmm1[4],zmm3[4],zmm1[6],zmm3[6] ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [6,14,6,14] +; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm15 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm13, %zmm5 @@ -946,7 +951,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm15 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm1[1],zmm3[1],zmm1[3],zmm3[3],zmm1[5],zmm3[5],zmm1[7],zmm3[7] ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [7,15,7,15] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm13 @@ -957,7 +963,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm17 = zmm0[0],zmm2[0],zmm0[2],zmm2[2],zmm0[4],zmm2[4],zmm0[6],zmm2[6] ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512BW-NEXT: # ymm15 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm15 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm17, %zmm4 @@ -968,7 +975,8 @@ define void @store_i64_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm18 = zmm0[1],zmm2[1],zmm0[3],zmm2[3],zmm0[5],zmm2[5],zmm0[7],zmm2[7] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1} ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,5,13] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2q %zmm8, %zmm7, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm18, %zmm6 @@ -1896,423 +1904,1709 @@ define void @store_i64_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i64_stride8_vf16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512F-NEXT: movb $-64, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512F-NEXT: 
vmovdqa (%rcx), %xmm2 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512F-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512F-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; 
AVX512F-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = -; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512F-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512F-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512F-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm9 -; 
AVX512F-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512F-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512F-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512F-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf16: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 
(%rax), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512F-ONLY-SLOW-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, 
%zmm12, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 +; 
AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i64_stride8_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm18 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm30 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm27 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm31 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm29 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %xmm16 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %xmm17 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm11 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, 
%zmm21 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 -; 
AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm9 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm13 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm14 -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] -; AVX512BW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] -; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm11 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 -; AVX512BW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm8 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm13 -; AVX512BW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm0, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 896(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 960(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 832(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 512(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm28, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf16: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: 
vpermt2q %zmm31, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: movb $-64, %al +; AVX512F-ONLY-FAST-NEXT: kmovw %eax, %k1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm10 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq 
{{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf16: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: movb $-64, %al +; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: 
# zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 +; 
AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = 
[3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: 
store_i64_stride8_vf16: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: movb $-64, %al +; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq 
{{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] +; AVX512DQ-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf16: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %al +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 
(%rdi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, 
%zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} 
zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 
= ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf16: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %al +; AVX512BW-ONLY-FAST-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm3, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: 
vbroadcasti32x4 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm26, 
%zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf16: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: movb $-64, %al +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 
%zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; 
AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf16: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm1 +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 (%rax), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm29 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: movb $-64, %al +; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %xmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %xmm17 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm6[1],xmm2[1] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm12[1],xmm11[1] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm10, %ymm20, %ymm10 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm3, %zmm21 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm20, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm20, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm6[0],xmm2[0] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm11[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm22 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm5[1],zmm30[1],zmm5[3],zmm30[3],zmm5[5],zmm30[5],zmm5[7],zmm30[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm3, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm23 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm5[0],zmm30[0],zmm5[2],zmm30[2],zmm5[4],zmm30[4],zmm5[6],zmm30[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm15 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5,6,7] +; 
AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm24 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm10, %zmm15 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm15 {%k1} = zmm8[1],zmm31[1],zmm8[3],zmm31[3],zmm8[5],zmm31[5],zmm8[7],zmm31[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm10, %zmm13 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm15, %zmm25 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm26, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 {%k1} = zmm8[0],zmm31[0],zmm8[2],zmm31[2],zmm8[4],zmm31[4],zmm8[6],zmm31[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm26, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm19 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm19 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm19, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm17[1],xmm16[1] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %xmm9 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm9[1],xmm7[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm13, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm14, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm20, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm13 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm17[0],xmm16[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm9[0],xmm7[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm14 +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 = zmm0[1],zmm27[1],zmm0[3],zmm27[3],zmm0[5],zmm27[5],zmm0[7],zmm27[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm11 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm7, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q 
%zmm29, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm0[0],zmm27[0],zmm0[2],zmm27[2],zmm0[4],zmm27[4],zmm0[6],zmm27[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm12 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm10, %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm1[1],zmm29[1],zmm1[3],zmm29[3],zmm1[5],zmm29[5],zmm1[7],zmm29[7] +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm16, %zmm13, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm14, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm26, %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm1[0],zmm29[0],zmm1[2],zmm29[2],zmm1[4],zmm29[4],zmm1[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm26, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm19, %zmm14 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm7, %zmm4 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm7, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm7, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm11[1],ymm9[1],ymm11[3],ymm9[3] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm14[1],ymm13[1],ymm14[3],ymm13[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm15[2,3],ymm12[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm10, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm31, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm30, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm11[0],ymm9[0],ymm11[2],ymm9[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm27, %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm27, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = 
ymm13[1],ymm8[1],ymm13[3],ymm8[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm14[2,3],ymm12[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm13[0],ymm8[0],ymm13[2],ymm8[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 64(%rax) +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <16 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <16 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <16 x i64>, ptr %in.vecptr2, align 64 @@ -4154,1001 +5448,4029 @@ define void @store_i64_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i64_stride8_vf32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512F-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512F-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm20 -; AVX512F-NEXT: movb $-64, %r11b -; AVX512F-NEXT: kmovw %r11d, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = 
zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = -; AVX512F-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; 
AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512F-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: 
vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 
-; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512F-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512F-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] -; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512F-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %xmm3 -; 
AVX512F-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] -; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512F-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rcx), %xmm1 -; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512F-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload 
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512F-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512F-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512F-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512F-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512F-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512F-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} 
ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %ymm18 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512F-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512F-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512F-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512F-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; 
AVX512F-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 1536(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512F-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm29, 1024(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, (%rax) -; AVX512F-NEXT: addq $2632, %rsp # imm = 0xA48 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf32: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b +; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, 
%zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm8 = 
ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; 
AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512F-ONLY-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm14, 1984(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm13, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm8, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i64_stride8_vf32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovaps 128(%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm23 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovaps 192(%rdx), %zmm2 -; AVX512BW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovaps 128(%rdx), %zmm2 -; AVX512BW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm22 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm19 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm29 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm21 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm20 -; AVX512BW-NEXT: movb $-64, %r11b -; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 
-; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 
%zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm28 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm16 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm23 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm0 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm31 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 -; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm25 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 -; 
AVX512BW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} -; AVX512BW-NEXT: 
vmovdqa 128(%rcx), %xmm1 -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] -; AVX512BW-NEXT: vmovdqa 128(%rsi), %xmm7 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rcx), %xmm13 -; AVX512BW-NEXT: vmovdqa 192(%rdx), %xmm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %xmm17 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %xmm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %ymm18 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm15 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %ymm18 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm19 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = 
ymm1[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm12 -; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %ymm16 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 1664(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 1216(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 1152(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 704(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 640(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 128(%rax) -; AVX512BW-NEXT: vmovaps %zmm14, 1984(%rax) -; AVX512BW-NEXT: vmovaps %zmm13, 1920(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 1856(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 1536(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm27, 1472(%rax) -; AVX512BW-NEXT: vmovaps %zmm8, 1408(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 1344(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 1024(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm30, 576(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 512(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, (%rax) -; AVX512BW-NEXT: addq $2632, %rsp # imm = 0xA48 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf32: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b +; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 +; 
AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 
%zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 %zmm18, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 
+; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: 
vmovdqa 128(%rcx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte 
Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm14, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm13, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm8, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf32: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQ-SLOW-NEXT: movb $-64, %r11b +; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; 
AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm28 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = 
zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, 
%zmm13, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; 
AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 +; AVX512DQ-SLOW-NEXT: 
vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; 
AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 
{%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm14, 1984(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm13, 1920(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovaps %zmm8, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: store_i64_stride8_vf32: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-FAST-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQ-FAST-NEXT: movb $-64, %r11b +; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm28 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: 
vpermt2q %zmm5, %zmm4, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, 
%zmm28, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = 
zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %xmm1 +; 
AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %xmm13 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; 
AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm14, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm13, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovaps %zmm8, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) 
+; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQ-FAST-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf32: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} 
ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; 
AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm9 = 
ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} 
ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm14, 1984(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm13, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm8, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf32: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: 
vpermt2q %zmm29, %zmm15, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 
%zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 +; 
AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, 
%ymm13, %zmm10, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; 
AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm14, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm13, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm8, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 
1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf32: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b +; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512DQBW-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm7 {%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm28 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, 
%zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = 
zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 
= xmm1[0],xmm0[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %xmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %xmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %xmm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm17 +; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 192(%rdi), %xmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 
{%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %ymm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %ymm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm14, 1984(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm13, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm8, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm30, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf32: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $2632, %rsp # imm = 0xA48 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovaps 192(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovaps 128(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm29 +; 
AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm20 +; AVX512DQBW-FAST-NEXT: movb $-64, %r11b +; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm22[0],zmm29[0],zmm22[2],zmm29[2],zmm22[4],zmm29[4],zmm22[6],zmm29[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm14, %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm22[1],zmm29[1],zmm22[3],zmm29[3],zmm22[5],zmm29[5],zmm22[7],zmm29[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm7 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm9, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm15, %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm7 {%k1} = zmm18[0],zmm21[0],zmm18[2],zmm21[2],zmm18[4],zmm21[4],zmm18[6],zmm21[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm7, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm7 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm7 
{%k1} = zmm18[1],zmm21[1],zmm18[3],zmm21[3],zmm18[5],zmm21[5],zmm18[7],zmm21[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm28 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm28 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm28, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm19[0],zmm25[0],zmm19[2],zmm25[2],zmm19[4],zmm25[4],zmm19[6],zmm25[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm4, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm19[1],zmm25[1],zmm19[3],zmm25[3],zmm19[5],zmm25[5],zmm19[7],zmm25[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm13, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm15, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm17[0],zmm20[0],zmm17[2],zmm20[2],zmm17[4],zmm20[4],zmm17[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm1 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm1 {%k1} = zmm17[1],zmm20[1],zmm17[3],zmm20[3],zmm17[5],zmm20[5],zmm17[7],zmm20[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm28, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm11, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm13, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm28, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm15, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm23, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm0 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm15, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm31, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm5, %zmm30 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm21, %zmm0, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm5, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm6, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm26, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm29, %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm29 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm26, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm0, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm6, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm26, %zmm20 +; 
AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm25 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm25 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm28[0],zmm4[0],zmm28[2],zmm4[2],zmm28[4],zmm4[4],zmm28[6],zmm4[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm4[1],zmm28[3],zmm4[3],zmm28[5],zmm4[5],zmm28[7],zmm4[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm26, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm0, %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm6, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm26, %zmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm14 = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm17 = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm13, %zmm0, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k1} = zmm23[0],zmm1[0],zmm23[2],zmm1[2],zmm23[4],zmm1[4],zmm23[6],zmm1[6] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm23[1],zmm1[1],zmm23[3],zmm1[3],zmm23[5],zmm1[5],zmm23[7],zmm1[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm26, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm31, %zmm26 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm31[0],zmm3[0],zmm31[2],zmm3[2],zmm31[4],zmm3[4],zmm31[6],zmm3[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm31[1],zmm3[1],zmm31[3],zmm3[3],zmm31[5],zmm3[5],zmm31[7],zmm3[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, %zmm24 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm3[0] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm0[0],xmm1[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm30, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm24, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 
64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %xmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm29[0],xmm4[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm2, %ymm22, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm29[1],xmm4[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %xmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm2[0],xmm1[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %xmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm19 = xmm8[0],xmm7[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm4, %ymm19, %ymm4 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm9, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm7[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm15, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm2 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm14, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm7 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm7, %zmm17, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm8, %zmm0, %zmm8 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm9 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm27, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %xmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %xmm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm15 = xmm14[0],xmm13[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %xmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm25 = xmm19[0],xmm17[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm15, %ymm25, %ymm15 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm15, %zmm5, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm14[1],xmm13[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm19[1],xmm17[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm13, %ymm14, %ymm13 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm10, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm13, %zmm11, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm13 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm13, %zmm0, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vblendps $240, (%rsp), %ymm0, %ymm14 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm14 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vinsertf64x4 $0, %ymm14, %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %ymm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm15[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm20 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %ymm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %ymm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm15 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm15[2,3],ymm9[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm9, %zmm1, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %ymm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm17[0],ymm15[0],ymm17[2],ymm15[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %ymm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %ymm19 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm19[0],ymm18[0],ymm19[2],ymm18[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm12, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm17[1],ymm15[1],ymm17[3],ymm15[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm19[1],ymm18[1],ymm19[3],ymm18[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm16, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm15 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %ymm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm17[0],ymm16[0],ymm17[2],ymm16[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm15[1],ymm12[1],ymm15[3],ymm12[3] +; 
AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm17[1],ymm16[1],ymm17[3],ymm16[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm12[2,3],ymm2[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm31, %zmm2 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm14, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm13, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 1792(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovaps %zmm8, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm30, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, (%rax) +; AVX512DQBW-FAST-NEXT: addq $2632, %rsp # imm = 0xA48 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <32 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <32 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <32 x i64>, ptr %in.vecptr2, align 64 @@ -8927,2033 +13249,8149 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: store_i64_stride8_vf64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: subq $5384, %rsp # imm = 0x1508 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512F-NEXT: 
vmovdqa64 64(%rdi), %zmm4 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512F-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512F-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512F-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512F-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512F-NEXT: vmovdqa64 (%rax), %zmm19 -; AVX512F-NEXT: movb $-64, %r11b -; AVX512F-NEXT: kmovw %r11d, %k1 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512F-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512F-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm27 = -; AVX512F-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512F-NEXT: 
vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512F-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm4, 
%zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512F-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512F-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512F-NEXT: vmovdqa64 320(%rsi), %zmm17 -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512F-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512F-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 -; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512F-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512F-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512F-NEXT: 
vmovdqa64 320(%r8), %zmm13 -; AVX512F-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512F-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512F-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512F-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512F-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512F-NEXT: vmovdqa64 448(%r10), %zmm28 -; AVX512F-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm5 -; AVX512F-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512F-NEXT: vpermi2q 
%zmm1, %zmm5, %zmm14 -; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 -; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512F-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512F-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 -; AVX512F-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 -; AVX512F-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; 
AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q 
%zmm22, %zmm6, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512F-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512F-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512F-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512F-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512F-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512F-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = 
zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512F-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512F-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512F-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512F-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512F-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 -; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = 
ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512F-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rcx), %ymm0 -; AVX512F-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 128(%rsi), %ymm8 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; 
AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rcx), %ymm0 -; AVX512F-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa 256(%rcx), %ymm0 -; AVX512F-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512F-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512F-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512F-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512F-NEXT: vmovdqa 
384(%rdx), %ymm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512F-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 -; AVX512F-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512F-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512F-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512F-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512F-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512F-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512F-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512F-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512F-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512F-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512F-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512F-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512F-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512F-NEXT: vinserti64x4 $0, %ymm2, 
%zmm1, %zmm2 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512F-NEXT: vmovdqa 128(%rdx), %xmm14 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512F-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512F-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512F-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512F-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512F-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512F-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512F-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512F-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512F-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512F-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512F-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512F-NEXT: vmovdqa64 256(%rsi), %xmm30 -; AVX512F-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512F-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512F-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512F-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] -; AVX512F-NEXT: vmovdqa64 320(%rsi), %xmm31 -; AVX512F-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512F-NEXT: 
vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512F-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512F-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512F-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512F-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512F-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512F-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512F-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512F-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512F-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512F-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512F-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm4, 2816(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 
896(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 832(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512F-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512F-NEXT: addq $5384, %rsp # imm = 0x1508 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512F-ONLY-SLOW-LABEL: store_i64_stride8_vf64: +; AVX512F-ONLY-SLOW: # %bb.0: +; AVX512F-ONLY-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512F-ONLY-SLOW-NEXT: movb $-64, %r11b +; AVX512F-ONLY-SLOW-NEXT: kmovw %r11d, %k1 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512F-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512F-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512F-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] +; AVX512F-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, 
%zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; 
AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-SLOW-NEXT: # zmm6 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: 
vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] +; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512F-ONLY-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = 
ymm2[2,3],ymm1[2,3] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq 
{{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = 
xmm28[1],xmm20[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 +; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512F-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1216(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 1152(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 704(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 640(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 192(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 4032(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3968(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3904(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3840(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax) +; 
AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512F-ONLY-SLOW-NEXT: vzeroupper +; AVX512F-ONLY-SLOW-NEXT: retq ; -; AVX512BW-LABEL: store_i64_stride8_vf64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: subq $5384, %rsp # imm = 0x1508 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm16 -; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm11 -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm24 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm20 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm25 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm22 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm18 -; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm23 -; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm21 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm26 -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm19 -; AVX512BW-NEXT: movb $-64, %r11b -; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] -; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = 
zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] -; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] -; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm27 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm27 = -; AVX512BW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; 
AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm16, 
%zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm13 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%r10), %zmm9 -; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm10 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 256(%r10), %zmm9 -; AVX512BW-NEXT: vmovdqa64 256(%rax), %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 256(%r9), %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
-; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %zmm17 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %zmm8 -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm4 -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 448(%rcx), %zmm4 -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%r10), %zmm26 -; AVX512BW-NEXT: vmovdqa64 320(%rax), %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqa64 320(%r8), %zmm13 -; AVX512BW-NEXT: vmovdqa64 320(%r9), %zmm15 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512BW-NEXT: 
vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r10), %zmm29 -; AVX512BW-NEXT: vmovdqa64 384(%rax), %zmm3 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%r8), %zmm25 -; AVX512BW-NEXT: vmovdqa64 384(%r9), %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm31 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 -; AVX512BW-NEXT: vmovdqa64 448(%r10), %zmm28 -; AVX512BW-NEXT: vmovdqa64 448(%rax), %zmm9 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %zmm1 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm2 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%r8), %zmm17 -; AVX512BW-NEXT: vmovdqa64 448(%r9), %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm30 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 -; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 -; 
AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm24 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] -; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] -; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] -; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 -; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 -; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vpermt2q %zmm4, 
%zmm5, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 -; AVX512BW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm22 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm23 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 -; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm15 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18 -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] -; AVX512BW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 -; AVX512BW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = 
zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] -; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512BW-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rcx), %ymm0 -; AVX512BW-NEXT: vmovdqa 128(%rdx), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 128(%rsi), %ymm8 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rcx), %ymm0 -; AVX512BW-NEXT: vmovdqa 192(%rdx), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 192(%rsi), %ymm4 -; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa 256(%rcx), %ymm0 -; AVX512BW-NEXT: vmovdqa 256(%rdx), %ymm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] -; AVX512BW-NEXT: vmovdqa 256(%rsi), %ymm4 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm8 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa 320(%rcx), %ymm1 -; AVX512BW-NEXT: vmovdqa 320(%rdx), %ymm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-NEXT: vmovdqa 320(%rsi), %ymm4 -; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm14 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa 384(%rcx), %ymm1 -; AVX512BW-NEXT: vmovdqa 384(%rdx), %ymm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-NEXT: vmovdqa 384(%rsi), %ymm7 -; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, 
%zmm18, %zmm18 -; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 448(%rcx), %ymm1 -; AVX512BW-NEXT: vmovdqa 448(%rdx), %ymm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] -; AVX512BW-NEXT: vmovdqa 448(%rsi), %ymm7 -; AVX512BW-NEXT: vmovdqa 448(%rdi), %ymm9 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512BW-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] -; AVX512BW-NEXT: vmovdqa 64(%rsi), %xmm14 -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %xmm17 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 128(%rcx), %xmm12 -; AVX512BW-NEXT: vmovdqa 128(%rdx), %xmm14 -; 
AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] -; AVX512BW-NEXT: vmovdqa64 128(%rsi), %xmm20 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %xmm26 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] -; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 192(%rcx), %xmm14 -; AVX512BW-NEXT: vmovdqa64 192(%rdx), %xmm20 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] -; AVX512BW-NEXT: vmovdqa64 192(%rsi), %xmm28 -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %xmm29 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 256(%rcx), %xmm20 -; AVX512BW-NEXT: vmovdqa64 256(%rdx), %xmm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] -; AVX512BW-NEXT: vmovdqa64 256(%rsi), %xmm30 -; AVX512BW-NEXT: vmovdqa 256(%rdi), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 320(%rcx), %xmm20 -; AVX512BW-NEXT: vmovdqa64 320(%rdx), %xmm28 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] -; AVX512BW-NEXT: vmovdqa64 320(%rsi), %xmm31 -; AVX512BW-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; 
AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} -; AVX512BW-NEXT: vmovdqa64 384(%rcx), %xmm20 -; AVX512BW-NEXT: vmovdqa64 384(%rdx), %xmm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512BW-NEXT: vmovdqa64 384(%rsi), %xmm28 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %xmm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 448(%rcx), %xmm20 -; AVX512BW-NEXT: vmovdqa64 448(%rdx), %xmm23 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] -; AVX512BW-NEXT: vmovdqa64 448(%rsi), %xmm28 -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm30 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] -; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 -; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm9, 3776(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 3712(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm25, 3264(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm18, 3200(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm13, 2752(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 2688(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 2240(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm19, 2176(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 1728(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm24, 1664(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 1216(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 1152(%rax) -; AVX512BW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 704(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 640(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 192(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 128(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 4032(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; 
AVX512BW-NEXT: vmovaps %zmm4, 3968(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 3904(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 3840(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 3648(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 3584(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 3520(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 3456(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 3392(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 3328(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 3136(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 3072(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 3008(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 2944(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 2880(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm4, 2816(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 2624(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm22, 2560(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2496(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2432(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2368(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 2304(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, 2112(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm29, 2048(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1984(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1920(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1856(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1792(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 1600(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm26, 1536(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1472(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1408(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1344(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 1280(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 1088(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 1024(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 960(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 896(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: 
vmovaps %zmm0, 832(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 768(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, 576(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax) -; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512BW-NEXT: addq $5384, %rsp # imm = 0x1508 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: store_i64_stride8_vf64: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512F-ONLY-FAST-NEXT: movb $-64, %r11b +; AVX512F-ONLY-FAST-NEXT: kmovw %r11d, %k1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, 
%zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512F-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] +; AVX512F-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; 
AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; 
AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 
%zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = 
zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, 
%zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512F-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 
+; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512F-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] +; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; 
AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = 
ymm14[2,3],ymm3[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512F-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, 
%ymm1, %zmm17, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] +; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] +; 
AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512F-ONLY-FAST-NEXT: 
vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512F-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1216(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 1152(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 704(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 640(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 4032(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3968(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3904(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3840(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) +; 
AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512F-ONLY-FAST-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-SLOW-LABEL: store_i64_stride8_vf64: +; AVX512DQ-SLOW: # %bb.0: +; AVX512DQ-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512DQ-SLOW-NEXT: movb $-64, %r11b +; AVX512DQ-SLOW-NEXT: kmovw %r11d, %k1 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512DQ-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = 
zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQ-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512DQ-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512DQ-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 +; AVX512DQ-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512DQ-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQ-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512DQ-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] +; AVX512DQ-SLOW-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: 
vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; 
AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: 
vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512DQ-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512DQ-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512DQ-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; 
AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 +; AVX512DQ-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] +; AVX512DQ-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7 
+; AVX512DQ-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7 +; AVX512DQ-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQ-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512DQ-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] +; AVX512DQ-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: 
vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12 +; AVX512DQ-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] +; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512DQ-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512DQ-SLOW-NEXT: 
vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31 +; AVX512DQ-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512DQ-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 +; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512DQ-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512DQ-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) +; 
AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1216(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 1152(%rax) +; AVX512DQ-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 704(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 640(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 192(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 4032(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3968(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3904(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3840(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 
1984(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512DQ-SLOW-NEXT: vzeroupper +; AVX512DQ-SLOW-NEXT: retq +; +; AVX512DQ-FAST-LABEL: store_i64_stride8_vf64: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%r10), %zmm23 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 (%r10), %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512DQ-FAST-NEXT: movb $-64, %r11b +; AVX512DQ-FAST-NEXT: kmovw %r11d, %k1 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512DQ-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQ-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512DQ-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQ-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512DQ-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQ-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 
+; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] +; AVX512DQ-FAST-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%r10), %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rax), %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: 
vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] +; 
AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r10), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), 
%zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rax), %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r8), %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%r9), %zmm15 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %zmm3, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r10), %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQ-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 
{{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512DQ-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512DQ-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q 
%zmm4, %zmm2, %zmm10 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512DQ-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 +; AVX512DQ-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] +; AVX512DQ-FAST-NEXT: 
vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] +; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; 
AVX512DQ-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rsi), %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQ-FAST-NEXT: 
vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rsi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdx), %ymm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rsi), %ymm7 +; AVX512DQ-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQ-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] +; AVX512DQ-FAST-NEXT: vmovdqa 64(%rsi), %xmm14 +; 
AVX512DQ-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rcx), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 128(%rdx), %xmm14 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] +; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa 192(%rcx), %xmm14 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: 
vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31 +; AVX512DQ-FAST-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512DQ-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512DQ-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; 
AVX512DQ-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1216(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 1152(%rax) +; AVX512DQ-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 704(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 640(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 4032(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3968(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3904(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3840(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 
2432(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FAST-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq +; +; AVX512BW-ONLY-SLOW-LABEL: store_i64_stride8_vf64: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: movb $-64, %r11b +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r11d, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512BW-ONLY-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512BW-ONLY-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512BW-ONLY-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] +; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 
%zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: 
vpermt2q %zmm22, %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 320(%r8), %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q 
%zmm4, %zmm2, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = 
zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-ONLY-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; 
AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: 
vmovdqa64 192(%rdi), %xmm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 +; AVX512BW-ONLY-SLOW-NEXT: 
vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1216(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 1152(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 704(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 640(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 4032(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3968(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3904(%rax) +; 
AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3840(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq +; +; AVX512BW-ONLY-FAST-LABEL: store_i64_stride8_vf64: +; AVX512BW-ONLY-FAST: # %bb.0: +; AVX512BW-ONLY-FAST-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%r10), %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512BW-ONLY-FAST-NEXT: movb $-64, %r11b +; AVX512BW-ONLY-FAST-NEXT: kmovd %r11d, %k1 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = 
[4,12,4,12] +; AVX512BW-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512BW-ONLY-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512BW-ONLY-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [7,15,7,15] +; AVX512BW-ONLY-FAST-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 
$0, %ymm8, %zmm10, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%r10), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rax), %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 
= zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r10), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rax), %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r8), %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%r9), %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = 
zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r10), %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512BW-ONLY-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512BW-ONLY-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512BW-ONLY-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512BW-ONLY-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = 
zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] +; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512BW-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; 
AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rsi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} 
ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdx), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdx), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rsi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; 
AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdx), %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rsi), %ymm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512BW-ONLY-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: 
vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rcx), %xmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 128(%rdx), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%rcx), %xmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 +; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} +; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] 
+; AVX512BW-ONLY-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512BW-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1216(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 1152(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 704(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 640(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 192(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 4032(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3968(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3904(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3840(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; 
AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512BW-ONLY-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512BW-ONLY-FAST-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512BW-ONLY-FAST-NEXT: vzeroupper +; 
AVX512BW-ONLY-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: store_i64_stride8_vf64: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%r10), %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512DQBW-SLOW-NEXT: movb $-64, %r11b +; AVX512DQBW-SLOW-NEXT: kmovd %r11d, %k1 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQBW-SLOW-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512DQBW-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 +; AVX512DQBW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512DQBW-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] +; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%r10), %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rax), %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm20, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} 
zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r10), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r8), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm15, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rax), %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r8), %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%r9), %zmm15 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r10), %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-SLOW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-SLOW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-SLOW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: 
vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm20, 
%zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: 
vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 +; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] +; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; 
AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm9[2,3],ymm8[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rsi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: 
vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdx), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdx), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rsi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQBW-SLOW-NEXT: 
vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdx), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rsi), %ymm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQBW-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %xmm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; 
AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rcx), %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa 128(%rdx), %xmm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rsi), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 128(%rdi), %xmm26 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] +; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%rcx), %xmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdx), %xmm20 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rsi), %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%rdi), %xmm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rcx), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rdx), %xmm28 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512DQBW-SLOW-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rcx), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rdx), %xmm28 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 320(%rsi), %xmm31 +; AVX512DQBW-SLOW-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rcx), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdx), %xmm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rdi), %xmm30 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rcx), %xmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdx), %xmm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512DQBW-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512DQBW-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 3712(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm25, 3264(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 2688(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, 2240(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, 2176(%rax) 
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, 1728(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm24, 1664(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1216(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 1152(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 704(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 640(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 4032(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3968(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3904(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3840(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 3648(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm15, 3072(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 2624(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm22, 2560(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 2112(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps 
%zmm0, 1984(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm14, 1600(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm26, 1536(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1344(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm17, 1024(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-SLOW-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-SLOW-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq +; +; AVX512DQBW-FAST-LABEL: store_i64_stride8_vf64: +; AVX512DQBW-FAST: # %bb.0: +; AVX512DQBW-FAST-NEXT: subq $5384, %rsp # imm = 0x1508 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdi), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rsi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rsi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdx), %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rdx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rcx), %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r8), %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r8), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r9), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r9), %zmm22 
+; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r9), %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%r10), %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%r10), %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rax), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 (%rax), %zmm19 +; AVX512DQBW-FAST-NEXT: movb $-64, %r11b +; AVX512DQBW-FAST-NEXT: kmovd %r11d, %k1 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,12,4,12,4,12,4,12] +; AVX512DQBW-FAST-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 = zmm2[0],zmm18[0],zmm2[2],zmm18[2],zmm2[4],zmm18[4],zmm2[6],zmm18[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,12,4,12] +; AVX512DQBW-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm15, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [5,13,5,13,5,13,5,13] +; AVX512DQBW-FAST-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 = zmm2[1],zmm18[1],zmm2[3],zmm18[3],zmm2[5],zmm18[5],zmm2[7],zmm18[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm12, %zmm0 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [5,13,5,13] +; AVX512DQBW-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [6,14,6,14,6,14,6,14] +; AVX512DQBW-FAST-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm27 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm13, %zmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm21[0],zmm19[0],zmm21[2],zmm19[2],zmm21[4],zmm19[4],zmm21[6],zmm19[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm13, %zmm10 +; AVX512DQBW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,14,6,14] +; AVX512DQBW-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm13 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm5, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm10, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,7,15,7,15,7,15] +; AVX512DQBW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm10 {%k1} = zmm21[1],zmm19[1],zmm21[3],zmm19[3],zmm21[5],zmm19[5],zmm21[7],zmm19[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm0, %zmm8 +; AVX512DQBW-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [7,15,7,15] +; AVX512DQBW-FAST-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm11, %zmm27, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm24[0],zmm25[0],zmm24[2],zmm25[2],zmm24[4],zmm25[4],zmm24[6],zmm25[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm26, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm9 = zmm24[1],zmm25[1],zmm24[3],zmm25[3],zmm24[5],zmm25[5],zmm24[7],zmm25[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm5, %zmm9 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm24, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm11, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 {%k1} = zmm23[0],zmm26[0],zmm23[2],zmm26[2],zmm23[4],zmm26[4],zmm23[6],zmm26[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm9, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%r10), %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rax), 
%zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm25, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm23[1],zmm26[1],zmm23[3],zmm26[3],zmm23[5],zmm26[5],zmm23[7],zmm26[7] +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 = zmm20[0],zmm22[0],zmm20[2],zmm22[2],zmm20[4],zmm22[4],zmm20[6],zmm22[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdx), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm20[1],zmm22[1],zmm20[3],zmm22[3],zmm20[5],zmm22[5],zmm20[7],zmm22[7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm13 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm10, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm18[0],zmm9[2],zmm18[2],zmm9[4],zmm18[4],zmm9[6],zmm18[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm4 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm18[1],zmm9[3],zmm18[3],zmm9[5],zmm18[5],zmm9[7],zmm18[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r10), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rax), %zmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm10 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm2[0],zmm10[0],zmm2[2],zmm10[2],zmm2[4],zmm10[4],zmm2[6],zmm10[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm16[1],zmm10[1],zmm16[3],zmm10[3],zmm16[5],zmm10[5],zmm16[7],zmm10[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm20[0],zmm9[2],zmm20[2],zmm9[4],zmm20[4],zmm9[6],zmm20[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm20[1],zmm9[3],zmm20[3],zmm9[5],zmm20[5],zmm9[7],zmm20[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r10), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rax), %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: 
vmovdqa64 256(%r8), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%r9), %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm4 = zmm5[0],zmm2[0],zmm5[2],zmm2[2],zmm5[4],zmm2[4],zmm5[6],zmm2[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm14, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm4, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm17[1],zmm16[1],zmm17[3],zmm16[3],zmm17[5],zmm16[5],zmm17[7],zmm16[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm13, %zmm8 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm11, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm3, %zmm6 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm11, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm6 {%k1} = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm5 +; AVX512DQBW-FAST-NEXT: 
vpermt2q %zmm9, %zmm15, %zmm5 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm13, %zmm6 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm3, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm27, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm15, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm13, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %zmm4 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm27, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r10), %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rax), %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r8), %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%r9), %zmm15 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm13[0],zmm15[0],zmm13[2],zmm15[2],zmm13[4],zmm15[4],zmm13[6],zmm15[6] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm24, %zmm9, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm5 = zmm13[1],zmm15[1],zmm13[3],zmm15[3],zmm13[5],zmm15[5],zmm13[7],zmm15[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm12, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm11, %zmm3 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = 
zmm26[0],zmm16[0],zmm26[2],zmm16[2],zmm26[4],zmm16[4],zmm26[6],zmm16[6] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 +; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm0, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm26[1],zmm16[1],zmm26[3],zmm16[3],zmm26[5],zmm16[5],zmm26[7],zmm16[7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm14, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm12, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm11, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r10), %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rax), %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm31 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm11, %zmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r10), %zmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rax), %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm1 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm14, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %zmm1 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm2 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm4, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm11, %zmm30 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm5, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm0, %zmm5 +; AVX512DQBW-FAST-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm24 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm0, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm0, %zmm27 +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,8,0,8,0,8,0,8] +; AVX512DQBW-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [1,9,1,9,1,9,1,9] +; AVX512DQBW-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] +; AVX512DQBW-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [3,11,3,11,3,11,3,11] +; AVX512DQBW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm19, %zmm1, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm21 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm23, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, 
%zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm19 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm18, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm14 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm20, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm5, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm2, %zmm11 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm5, %zmm4 +; 
AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm6, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm4 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm22, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm2, %zmm10 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm4, %zmm1, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm2, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm16, %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm22 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm5, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm23 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm6, %zmm23 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm2, %zmm7 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm15, %zmm1, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm31 {%k1} = zmm29[0],zmm3[0],zmm29[2],zmm3[2],zmm29[4],zmm3[4],zmm29[6],zmm3[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm24 {%k1} = zmm29[1],zmm3[1],zmm29[3],zmm3[3],zmm29[5],zmm3[5],zmm29[7],zmm3[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm20 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm1, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm15 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm5, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm16 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm6, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm2, %zmm18 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm3 = 
zmm25[0],zmm12[0],zmm25[2],zmm12[2],zmm25[4],zmm12[4],zmm25[6],zmm12[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm4 = zmm25[1],zmm12[1],zmm25[3],zmm12[3],zmm25[5],zmm12[5],zmm25[7],zmm12[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm12, %zmm1, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm5, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm0 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm6, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm30 {%k1} = zmm28[0],zmm9[0],zmm28[2],zmm9[2],zmm28[4],zmm9[4],zmm28[6],zmm9[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm27 {%k1} = zmm28[1],zmm9[1],zmm28[3],zmm9[3],zmm28[5],zmm9[5],zmm28[7],zmm9[7] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm2, %zmm12 +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm9, %zmm1, %zmm28 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm5 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm6 +; AVX512DQBW-FAST-NEXT: vpermi2q %zmm8, %zmm17, %zmm2 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm9 = zmm17[0],zmm8[0],zmm17[2],zmm8[2],zmm17[4],zmm8[4],zmm17[6],zmm8[6] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm17[1],zmm8[1],zmm17[3],zmm8[3],zmm17[5],zmm8[5],zmm17[7],zmm8[7] +; AVX512DQBW-FAST-NEXT: vpermt2q %zmm8, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm3 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm31, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; 
AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm9, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512DQBW-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm21 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm9[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm21, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %ymm4 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm8 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm8 = 
ymm9[2,3],ymm8[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm8, %zmm19, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 (%rsp), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm14 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rsi), %ymm8 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm14, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rsi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm11, %zmm24 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm21 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rcx), %ymm0 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdx), %ymm1 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rsi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm8[0],ymm4[0],ymm8[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm14[2,3],ymm3[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm10, %zmm19 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm9 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm8[1],ymm4[1],ymm8[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm10 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdx), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rsi), %ymm4 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %ymm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm9 = ymm14[0],ymm4[0],ymm14[2],ymm4[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm8 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, %zmm13 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm13 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm18 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdx), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rsi), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm14[2,3],ymm4[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm18 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, %zmm25 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm25 +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, %zmm2 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rcx), %ymm1 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdx), %ymm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rsi), %ymm7 +; AVX512DQBW-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm12[2,3],ymm4[2,3] +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm4 +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm9[1],ymm7[1],ymm9[3],ymm7[3] +; 
AVX512DQBW-FAST-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm28, %zmm17 {%k1} +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm9 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; AVX512DQBW-FAST-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQBW-FAST-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm14 = xmm12[0],xmm7[0] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm14, %ymm1 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm11 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm12[1],xmm7[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm7 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rdx), %xmm12 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm12[0],xmm2[0] +; AVX512DQBW-FAST-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 64(%rdi), %xmm17 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm20 = xmm17[0],xmm14[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm3, %ymm20, %ymm3 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm3 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm12[1],xmm2[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm17[1],xmm14[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm2 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rcx), %xmm12 +; AVX512DQBW-FAST-NEXT: vmovdqa 128(%rdx), %xmm14 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm17 = xmm14[0],xmm12[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rsi), %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 128(%rdi), %xmm26 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm28 = xmm26[0],xmm20[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm17, %ymm28, %ymm17 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm17, %zmm1, %zmm17 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm14[1],xmm12[1] +; AVX512DQBW-FAST-NEXT: 
vpunpckhqdq {{.*#+}} xmm14 = xmm26[1],xmm20[1] +; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm12, %ymm14, %ymm12 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm12, %zmm1, %zmm12 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa 192(%rcx), %xmm14 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdx), %xmm20 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm26 = xmm20[0],xmm14[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rsi), %xmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%rdi), %xmm29 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm29[0],xmm28[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm26, %ymm30, %ymm26 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm26, %zmm1, %zmm26 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm20[1],xmm14[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm29[1],xmm28[1] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm14, %ymm20, %ymm14 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm14, %zmm1, %zmm14 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rcx), %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rdx), %xmm28 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm29 = xmm28[0],xmm20[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 256(%rsi), %xmm30 +; AVX512DQBW-FAST-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm0[0],xmm30[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm29, %ymm31, %ymm29 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm29, %zmm1, %zmm29 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm30[1] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm1 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rcx), %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rdx), %xmm28 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm30 = xmm28[0],xmm20[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 320(%rsi), %xmm31 +; AVX512DQBW-FAST-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm0[0],xmm31[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm30, %ymm27, %ymm27 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm22, %zmm22 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm23 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm28[1],xmm20[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm31[1] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm0 +; 
AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm15 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rcx), %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdx), %xmm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rsi), %xmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%rdi), %xmm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm15, %zmm15 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm16 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm16, %zmm16 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm20, %zmm5 {%k1} +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rcx), %xmm20 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdx), %xmm23 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm27 = xmm23[0],xmm20[0] +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rsi), %xmm28 +; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%rdi), %xmm30 +; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm31 = xmm30[0],xmm28[0] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm31, %ymm27 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm27, %zmm5, %zmm5 +; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm6 {%k1} +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm20 = xmm23[1],xmm20[1] +; AVX512DQBW-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm23 = xmm30[1],xmm28[1] +; AVX512DQBW-FAST-NEXT: vinserti32x4 $1, %xmm20, %ymm23, %ymm20 +; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm20, %zmm6, %zmm6 +; AVX512DQBW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, 3776(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm4, 3712(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm25, 3264(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm18, 3200(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm13, 2752(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, 2688(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm10, 2240(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm19, 2176(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm21, 1728(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm24, 1664(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1216(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 1152(%rax) +; AVX512DQBW-FAST-NEXT: vmovups (%rsp), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 704(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 640(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 192(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 128(%rax) +; AVX512DQBW-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 4032(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3968(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3904(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3840(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm6, 3648(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm5, 3584(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3520(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3456(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3392(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3328(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm16, 3136(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm15, 3072(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 3008(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2944(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2880(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm4, 2816(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm0, 2624(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm22, 2560(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2496(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2432(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2368(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 2304(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm1, 2112(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm29, 2048(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1984(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1920(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1856(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1792(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm14, 1600(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm26, 1536(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1472(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1408(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 
1344(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 1280(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm12, 1088(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, 1024(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, 512(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 448(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 384(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 320(%rax) +; AVX512DQBW-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQBW-FAST-NEXT: vmovaps %zmm0, 256(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, 64(%rax) +; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQBW-FAST-NEXT: addq $5384, %rsp # imm = 0x1508 +; AVX512DQBW-FAST-NEXT: vzeroupper +; AVX512DQBW-FAST-NEXT: retq %in.vec0 = load <64 x i64>, ptr %in.vecptr0, align 64 %in.vec1 = load <64 x i64>, ptr %in.vecptr1, align 64 %in.vec2 = load <64 x i64>, ptr %in.vecptr2, align 64 @@ -10981,16 +21419,8 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE: {{.*}} ; AVX2-SLOW: {{.*}} ; AVX512BW-FAST: {{.*}} -; AVX512BW-ONLY-FAST: {{.*}} -; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512BW-SLOW: {{.*}} -; AVX512DQ-FAST: {{.*}} -; AVX512DQ-SLOW: {{.*}} -; AVX512DQBW-FAST: {{.*}} -; AVX512DQBW-SLOW: {{.*}} ; AVX512F-FAST: {{.*}} -; AVX512F-ONLY-FAST: {{.*}} -; AVX512F-ONLY-SLOW: {{.*}} ; AVX512F-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index 17ed73d9e3b16..164d8bef447a0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -511,7 +511,8 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -537,7 
+538,8 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -992,7 +994,8 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm5 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -1038,7 +1041,8 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX512F-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX512F-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vpshufb %ymm7, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 04bc4e5da890f..4f2ee5d5985b0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -342,7 +342,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-SLOW-NEXT: shrq $48, %rax ; AVX2-SLOW-NEXT: vmovd %eax, %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,0,255,255,255,255,0,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 
(%r9) @@ -374,7 +374,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FAST-NEXT: shrq $48, %rax ; AVX2-FAST-NEXT: vmovd %eax, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,0,255,255,255,255,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) @@ -406,7 +406,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FAST-PERLANE-NEXT: shrq $48, %rax ; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,0,255,255,255,255,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) @@ -1592,7 +1592,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255,255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255> +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] +; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> @@ -3293,12 +3294,14 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm10, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm12, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm14 @@ -3336,7 +3339,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,u,4,4,4,4> ; AVX2-SLOW-NEXT: vpermd %ymm15, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpbroadcastq 
{{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 @@ -3393,7 +3396,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: # xmm5 = mem[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm6 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm14[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7] @@ -3404,7 +3407,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 ; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] @@ -3505,7 +3508,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu %ymm13, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] @@ -3515,7 +3519,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 @@ -3547,11 +3552,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25] @@ -3579,7 +3586,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4> ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm2 @@ -3715,7 +3722,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] @@ -3725,7 +3733,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm4, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 @@ -3763,7 +3772,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm14 = 
[128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] @@ -3793,7 +3803,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4> ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm9, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm11 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4 @@ -3915,7 +3925,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm23 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,zero,ymm7[26],zero,ymm7[28],zero,ymm7[30],zero,zero,ymm7[29],zero,ymm7[31],zero,zero -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm15 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm31 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm18 @@ -3959,12 +3970,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpandnq %ymm27, %ymm30, %ymm27 ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm4, %ymm13 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm13, %zmm27 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm14 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm31, %ymm9 ; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm12, %ymm13 ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm7 ; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm6, %ymm12 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero,zero @@ -4146,11 +4158,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpandnq %ymm28, %ymm29, %ymm28 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm28, %zmm3, %zmm28 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
<9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm14 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm12[19],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm7, %ymm12 @@ -4234,193 +4247,195 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; -; AVX512BW-SLOW-LABEL: store_i8_stride5_vf64: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u,9,u,11,u,u,10,u,12,u,14,u,u,13,u,15,u> -; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 -; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> -; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14,11,u,u,10,u,12,u,u,u,u,13,u,15,u,u,14> -; AVX512BW-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 -; AVX512BW-SLOW-NEXT: kmovd %eax, %k5 -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5} -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512BW-SLOW-NEXT: 
movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k4 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4} -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17 -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512BW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19 -; AVX512BW-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> -; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6 -; AVX512BW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> -; AVX512BW-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> -; AVX512BW-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13 -; AVX512BW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4> -; AVX512BW-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512BW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 -; AVX512BW-SLOW-NEXT: kmovd %eax, %k2 -; AVX512BW-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 -; AVX512BW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4> -; AVX512BW-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18 -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 -; AVX512BW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k6 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm17 = -; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 -; 
AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512BW-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 -; AVX512BW-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512BW-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = -; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] -; AVX512BW-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26 -; AVX512BW-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 -; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 -; AVX512BW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 -; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20 -; AVX512BW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 -; AVX512BW-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19 -; AVX512BW-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20 -; AVX512BW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] -; AVX512BW-SLOW-NEXT: movabsq 
$-4165393823095705204, %rax # imm = 0xC6318C6318C6318C -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm7 -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 -; AVX512BW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} -; AVX512BW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512BW-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7 -; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1 -; AVX512BW-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2} -; AVX512BW-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2 -; AVX512BW-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX512BW-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3} -; AVX512BW-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0 -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 -; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq +; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride5_vf64: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; 
AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512BW-ONLY-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = 
<8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4> +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512BW-ONLY-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %eax, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4> +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX512BW-ONLY-SLOW-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512BW-ONLY-SLOW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] +; 
AVX512BW-ONLY-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 +; AVX512BW-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm23, %ymm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20 +; AVX512BW-ONLY-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] +; AVX512BW-ONLY-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; 
AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512BW-ONLY-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0 +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rax, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9) +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride5_vf64: ; AVX512BW-FAST: # %bb.0: @@ -4457,7 +4472,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4> ; AVX512BW-FAST-NEXT: vpermd %ymm21, %ymm3, %ymm22 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %ymm23 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX512BW-FAST-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 ; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm23, %ymm22 {%k1} @@ -4576,6 +4591,196 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 192(%r9) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: store_i8_stride5_vf64: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: movl $693250386, %eax # imm = 0x29522952 +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512DQBW-SLOW-NEXT: vpshufb 
%xmm7, %xmm3, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm16 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: movl $1251232404, %eax # imm = 0x4A944A94 +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k5 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm9, %ymm4 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 +; AVX512DQBW-SLOW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k4 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm5, %zmm4 {%k4} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %xmm17 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm17[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm23 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm23, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm24 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm19 +; AVX512DQBW-SLOW-NEXT: vporq %ymm18, %ymm19, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm12, %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm6, %xmm6 +; AVX512DQBW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm14, %xmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = 
<128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm13, %xmm13 +; AVX512DQBW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4> +; AVX512DQBW-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512DQBW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 +; AVX512DQBW-SLOW-NEXT: kmovd %eax, %k2 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm26, %ymm18 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm14 +; AVX512DQBW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm14, %zmm6 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,3,3,3,u,4,4,4> +; AVX512DQBW-SLOW-NEXT: vpermd %ymm15, %ymm14, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm17 = xmm17[1,1,2,2] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm17, %zmm17 +; AVX512DQBW-SLOW-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k6 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm17, %zmm6 {%k6} +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm17 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128] +; AVX512DQBW-SLOW-NEXT: # ymm17 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm26, %ymm18 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm18[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm25, %ymm28 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vporq %ymm27, %ymm28, %ymm27 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm16, %ymm26, %ymm16 +; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm25 = ymm25[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm25 = ymm25[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm25, %ymm16 {%k5} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm27, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm23, %ymm26 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm27 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512DQBW-SLOW-NEXT: # ymm27 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm24, %ymm28 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vporq %ymm26, %ymm28, %ymm26 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 +; AVX512DQBW-SLOW-NEXT: vpshufhw {{.*#+}} ymm23 = ymm23[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm23 = ymm23[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm23, 
%ymm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm26, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k4} +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm16 = ymm15[0,2,1,1,4,6,5,5] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,3,2] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,3,3,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512DQBW-SLOW-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm19, %xmm15, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm20, %xmm16, %xmm20 +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3],xmm15[4],xmm16[4],xmm15[5],xmm16[5],xmm15[6],xmm16[6],xmm15[7],xmm16[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %xmm16 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm21, %xmm16, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm22, %xmm15, %xmm20 +; AVX512DQBW-SLOW-NEXT: vporq %xmm19, %xmm20, %xmm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,1,4,4,5,5] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] +; AVX512DQBW-SLOW-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,0,1,1] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 +; AVX512DQBW-SLOW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm25, %ymm1, %ymm7 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm0, %ymm15 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpor %ymm7, %ymm15, %ymm7 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: vpermd %ymm3, %ymm12, %ymm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm17, %ymm2, %ymm2 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm18, %ymm3, %ymm3 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: 
vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] +; AVX512DQBW-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vpermd %ymm5, %ymm14, %ymm0 +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,2,1,1,4,6,5,5] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQBW-SLOW-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 +; AVX512DQBW-SLOW-NEXT: kmovq %rax, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, 64(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, (%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, 256(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm6, 192(%r9) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm4, 128(%r9) +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 @@ -4595,11 +4800,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2: {{.*}} ; AVX512: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} -; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} -; AVX512DQBW-SLOW: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index e8ca42820850f..bb052c6fa70d9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -1286,7 +1286,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6 @@ -1294,7 +1294,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm10 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 @@ -1308,12 +1308,12 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; 
AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm12 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm5 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[16],ymm12[16],ymm8[17],ymm12[17],ymm8[18],ymm12[18],ymm8[19],ymm12[19],ymm8[20],ymm12[20],ymm8[21],ymm12[21],ymm8[22],ymm12[22],ymm8[23],ymm12[23] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm15 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm1 @@ -1423,14 +1423,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm9 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm7 @@ -1445,11 +1445,11 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm12 ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm10 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm15 ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm15 = 
ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] @@ -1554,14 +1554,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm7 @@ -1576,11 +1576,11 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm10[0],ymm12[0],ymm10[1],ymm12[1],ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[4],ymm12[4],ymm10[5],ymm12[5],ymm10[6],ymm12[6],ymm10[7],ymm12[7],ymm10[16],ymm12[16],ymm10[17],ymm12[17],ymm10[18],ymm12[18],ymm10[19],ymm12[19],ymm10[20],ymm12[20],ymm10[21],ymm12[21],ymm10[22],ymm12[22],ymm10[23],ymm12[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] @@ -1717,7 +1717,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, 
%zmm7, %zmm7 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm12, %ymm15 ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm10, %ymm14 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23] @@ -1729,7 +1729,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm16 ; AVX512F-SLOW-NEXT: vpandq %zmm16, %zmm14, %zmm14 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm13 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm0 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm13[0],ymm0[1],ymm13[1],ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[4],ymm13[4],ymm0[5],ymm13[5],ymm0[6],ymm13[6],ymm0[7],ymm13[7],ymm0[16],ymm13[16],ymm0[17],ymm13[17],ymm0[18],ymm13[18],ymm0[19],ymm13[19],ymm0[20],ymm13[20],ymm0[21],ymm13[21],ymm0[22],ymm13[22],ymm0[23],ymm13[23] @@ -1738,7 +1738,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vpternlogq $186, %zmm14, %zmm15, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512F-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm10 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23] @@ -1747,7 +1747,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] @@ -1755,7 +1755,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512F-SLOW-NEXT: vprold $16, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -1765,7 +1765,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -1820,7 +1820,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm15 ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm14 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm14 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[16],ymm15[16],ymm14[17],ymm15[17],ymm14[18],ymm15[18],ymm14[19],ymm15[19],ymm14[20],ymm15[20],ymm14[21],ymm15[21],ymm14[22],ymm15[22],ymm14[23],ymm15[23] @@ -1832,7 +1832,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm15, %zmm16 ; AVX512F-FAST-NEXT: vpandq %zmm16, %zmm14, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm13 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm13[0],ymm2[1],ymm13[1],ymm2[2],ymm13[2],ymm2[3],ymm13[3],ymm2[4],ymm13[4],ymm2[5],ymm13[5],ymm2[6],ymm13[6],ymm2[7],ymm13[7],ymm2[16],ymm13[16],ymm2[17],ymm13[17],ymm2[18],ymm13[18],ymm2[19],ymm13[19],ymm2[20],ymm13[20],ymm2[21],ymm13[21],ymm2[22],ymm13[22],ymm2[23],ymm13[23] @@ -1841,7 +1841,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] ; AVX512F-FAST-NEXT: 
vpternlogq $186, %zmm14, %zmm15, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm10 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[4],ymm11[4],ymm10[5],ymm11[5],ymm10[6],ymm11[6],ymm10[7],ymm11[7],ymm10[16],ymm11[16],ymm10[17],ymm11[17],ymm10[18],ymm11[18],ymm10[19],ymm11[19],ymm10[20],ymm11[20],ymm10[21],ymm11[21],ymm10[22],ymm11[22],ymm10[23],ymm11[23] @@ -1850,7 +1850,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm9 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] @@ -1858,7 +1858,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm9, %zmm2 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] @@ -1867,7 +1867,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm2 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] @@ -1892,14 +1892,14 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %ymm1 ; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm7 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm8 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512BW-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] ; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm12 ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 @@ -1920,7 +1920,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm11, %ymm10 {%k2} ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm10[0,1,2,3],zmm6[4,5,6,7] ; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm11 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm13 ; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm14 ; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm14, %xmm11 @@ -1962,19 +1962,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-SLOW-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1} ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm5, %ymm5 ; AVX512BW-SLOW-NEXT: vpshufb %ymm9, %ymm4, %ymm4 ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX512BW-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512BW-SLOW-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm4 ; AVX512BW-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm3 ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] @@ -2013,12 +2013,12 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 ; AVX512BW-FAST-NEXT: vpermw %ymm6, %ymm12, %ymm11 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm12 ; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm6 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm13 ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm12 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] @@ -2029,7 +2029,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm6 ; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm11[0,1,2,3],zmm6[4,5,6,7] ; AVX512BW-FAST-NEXT: vmovdqa (%r9), %xmm11 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm13 ; AVX512BW-FAST-NEXT: vmovdqa (%r8), %xmm14 ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 @@ -2070,19 +2070,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-FAST-NEXT: vpermw %ymm8, %ymm10, %ymm9 {%k1} ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm8 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm5 ; AVX512BW-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm4 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512BW-FAST-NEXT: vmovdqu16 %ymm4, %ymm2 {%k1} ; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm8[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u,6,5,8,7,u,9,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm4 ; AVX512BW-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm3 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] @@ -3107,7 +3107,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3117,7 +3117,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm13 @@ -3140,7 +3140,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] @@ -3148,7 +3148,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] @@ -3186,7 +3186,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u> +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm10 @@ -3212,7 +3213,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm3, %ymm2 @@ -3281,7 +3283,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm10 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u> +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm14 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] @@ -3301,7 +3304,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm10, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -3343,10 +3347,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = ymm2[8],mem[8],ymm2[9],mem[9],ymm2[10],mem[10],ymm2[11],mem[11],ymm2[12],mem[12],ymm2[13],mem[13],ymm2[14],mem[14],ymm2[15],mem[15],ymm2[24],mem[24],ymm2[25],mem[25],ymm2[26],mem[26],ymm2[27],mem[27],ymm2[28],mem[28],ymm2[29],mem[29],ymm2[30],mem[30],ymm2[31],mem[31] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm14, %ymm14 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm14, %ymm10 @@ -3366,7 +3372,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u> +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm4, %ymm4 @@ -3383,7 +3390,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm6, %ymm8 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm8, %ymm4 @@ -3425,7 +3433,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3436,7 +3444,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 
(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3459,7 +3467,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] @@ -3467,7 +3475,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] @@ -3504,7 +3512,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u> +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 @@ -3530,7 +3539,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 @@ -3565,13 +3575,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1 @@ -3595,7 +3607,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u> +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] @@ -3615,7 +3628,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -3657,10 +3671,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = 
ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 @@ -3681,7 +3697,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u> +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 @@ -3697,7 +3714,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 @@ -3739,7 +3757,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3750,7 +3768,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3773,7 
+3791,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[16],ymm4[16],ymm5[17],ymm4[17],ymm5[18],ymm4[18],ymm5[19],ymm4[19],ymm5[20],ymm4[20],ymm5[21],ymm4[21],ymm5[22],ymm4[22],ymm5[23],ymm4[23] @@ -3781,7 +3799,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] @@ -3818,7 +3836,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 @@ -3844,7 +3863,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, 
%ymm4, %ymm2, %ymm2 @@ -3879,13 +3899,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1 @@ -3909,7 +3931,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm15, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u> +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm9, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] @@ -3929,7 +3952,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] @@ -3971,10 +3995,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm2[0],mem[0],ymm2[1],mem[1],ymm2[2],mem[2],ymm2[3],mem[3],ymm2[4],mem[4],ymm2[5],mem[5],ymm2[6],mem[6],ymm2[7],mem[7],ymm2[16],mem[16],ymm2[17],mem[17],ymm2[18],mem[18],ymm2[19],mem[19],ymm2[20],mem[20],ymm2[21],mem[21],ymm2[22],mem[22],ymm2[23],mem[23] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25,18,19,16,17,22,23,20,21,24,25,24,25,24,25,24,25] +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm15, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm15, %ymm5 @@ -3995,7 +4021,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm4 @@ -4011,7 +4038,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4 @@ -4056,7 +4084,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm7 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm13 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm1 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -4087,30 +4115,34 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, 
%xmm28 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm13, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm4, %ymm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm4[8],ymm13[8],ymm4[9],ymm13[9],ymm4[10],ymm13[10],ymm4[11],ymm13[11],ymm4[12],ymm13[12],ymm4[13],ymm13[13],ymm4[14],ymm13[14],ymm4[15],ymm13[15],ymm4[24],ymm13[24],ymm4[25],ymm13[25],ymm4[26],ymm13[26],ymm4[27],ymm13[27],ymm4[28],ymm13[28],ymm4[29],ymm13[29],ymm4[30],ymm13[30],ymm4[31],ymm13[31] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 ; AVX512F-SLOW-NEXT: vmovdqa %ymm13, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm6, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm1 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15],ymm7[24],ymm6[24],ymm7[25],ymm6[25],ymm7[26],ymm6[26],ymm7[27],ymm6[27],ymm7[28],ymm6[28],ymm7[29],ymm6[29],ymm7[30],ymm6[30],ymm7[31],ymm6[31] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-SLOW-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm10, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; 
AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm15 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm25 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %ymm15 @@ -4165,11 +4197,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm12, %ymm12 ; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm3, %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] @@ -4196,13 +4229,16 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u> +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %ymm0 ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u> +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm3, %ymm4 ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4334,32 +4370,36 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm7 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 ; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm7, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm8, %ymm1 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15],ymm8[24],ymm7[24],ymm8[25],ymm7[25],ymm8[26],ymm7[26],ymm8[27],ymm7[27],ymm8[28],ymm7[28],ymm8[29],ymm7[29],ymm8[30],ymm7[30],ymm8[31],ymm7[31] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm26 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm27 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm6, %ymm1 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15],ymm6[24],ymm2[24],ymm6[25],ymm2[25],ymm6[26],ymm2[26],ymm6[27],ymm2[27],ymm6[28],ymm2[28],ymm6[29],ymm2[29],ymm6[30],ymm2[30],ymm6[31],ymm2[31] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm28 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vmovdqa %ymm12, %ymm6 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm12, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm12, %ymm12 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm30 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm12, %zmm2 @@ -4387,7 +4427,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, 
%zmm1, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -4439,7 +4479,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 @@ -4447,25 +4488,29 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm12 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512F-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u> +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u,10,u,13,u,12,u,11,u,14,u,u,u,u,u,15,u> +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm15 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm15, %ymm17 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u> +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] +; AVX512F-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm15 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm0 ; AVX512F-FAST-NEXT: 
vpshufb %xmm5, %xmm2, %xmm1 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -4599,221 +4644,223 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; -; AVX512BW-SLOW-LABEL: store_i8_stride6_vf64: -; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] -; AVX512BW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512BW-SLOW-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] -; AVX512BW-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] -; AVX512BW-SLOW-NEXT: vprold $16, %ymm6, %ymm6 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512BW-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924 -; AVX512BW-SLOW-NEXT: kmovd %r10d, %k1 -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa (%r8), %xmm5 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] -; AVX512BW-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %ymm21 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u> -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; 
AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249 -; AVX512BW-SLOW-NEXT: kmovd %r10d, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa (%r9), %xmm6 -; AVX512BW-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = -; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 -; AVX512BW-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 -; AVX512BW-SLOW-NEXT: kmovq %r10, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3} -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 -; AVX512BW-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23] -; AVX512BW-SLOW-NEXT: vprold $16, %ymm30, %ymm30 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30 -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2} -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 -; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23 -; AVX512BW-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = 
<8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20 -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm27 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> -; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20 -; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = -; AVX512BW-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7] -; AVX512BW-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 -; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} -; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = -; AVX512BW-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7] -; AVX512BW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 -; AVX512BW-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23 -; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = 
ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] -; AVX512BW-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17 -; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16 -; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] -; AVX512BW-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14 -; AVX512BW-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2} -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12 -; AVX512BW-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] -; AVX512BW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = 
xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512BW-SLOW-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} -; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] -; AVX512BW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 -; AVX512BW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8 -; AVX512BW-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 -; AVX512BW-SLOW-NEXT: kmovq %rcx, %k3 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3} -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8 -; AVX512BW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 -; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3 -; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8 -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512BW-SLOW-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] -; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2} -; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] -; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 -; AVX512BW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1} -; 
AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512BW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2 -; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) -; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-SLOW-NEXT: vzeroupper -; AVX512BW-SLOW-NEXT: retq +; AVX512BW-ONLY-SLOW-LABEL: store_i8_stride6_vf64: +; AVX512BW-ONLY-SLOW: # %bb.0: +; AVX512BW-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm6, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512BW-ONLY-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k1 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r8), %ymm21 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-ONLY-SLOW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512BW-ONLY-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %r10d, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%r9), %xmm6 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 (%r9), %ymm22 +; AVX512BW-ONLY-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-ONLY-SLOW-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %r10, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %ymm30, %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 
%zmm20, %zmm7 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 
%zmm25, %zmm20 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 +; AVX512BW-ONLY-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] +; 
AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 +; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm8, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm8, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8 +; AVX512BW-ONLY-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 +; AVX512BW-ONLY-SLOW-NEXT: kmovq %rcx, %k3 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX512BW-ONLY-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512BW-ONLY-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3 +; AVX512BW-ONLY-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8 +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512BW-ONLY-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2} +; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1} +; AVX512BW-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm2 +; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-ONLY-SLOW-NEXT: vzeroupper +; AVX512BW-ONLY-SLOW-NEXT: retq ; ; AVX512BW-FAST-LABEL: store_i8_stride6_vf64: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm9 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r9), %zmm10 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm3 @@ -4824,7 +4871,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermw %ymm3, %ymm8, %ymm3 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u,5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm6 ; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm4 
@@ -4884,7 +4931,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm10, %zmm7 {%k3} ; AVX512BW-FAST-NEXT: vmovdqa64 (%rcx), %xmm21 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm23 = <5,8,7,6,9,u,u,10,u,u,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm23 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX512BW-FAST-NEXT: vpshufb %xmm23, %xmm12, %xmm10 ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdx), %xmm22 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm14 @@ -4896,7 +4943,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermw %ymm15, %ymm24, %ymm15 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm16 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm25 = +; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm25 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm15, %xmm10 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm19 ; AVX512BW-FAST-NEXT: vpshufb %xmm25, %xmm19, %xmm20 @@ -4972,7 +5019,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermw %zmm5, %zmm18, %zmm1 {%k1} ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] ; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u,2,u,1,u,0,u,3,u,u,u,u,u,4,u,u,u> +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm8 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 @@ -4981,7 +5029,8 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm2, %zmm11 {%k1} ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm26[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FAST-NEXT: vpermw %ymm2, %ymm5, %ymm2 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 @@ -5009,6 +5058,217 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq +; +; AVX512DQBW-SLOW-LABEL: store_i8_stride6_vf64: +; AVX512DQBW-SLOW: # %bb.0: +; AVX512DQBW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %zmm14 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %zmm12 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rsi), %ymm16 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm9 +; 
AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,24,27,26,25,24,27,26,25,24,27,26,25,28,29,30,29] +; AVX512DQBW-SLOW-NEXT: vpermw %zmm0, %zmm7, %zmm0 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm5, %ymm20, %ymm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rcx), %ymm18 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%rdx), %ymm19 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm19[0],ymm18[0],ymm19[1],ymm18[1],ymm19[2],ymm18[2],ymm19[3],ymm18[3],ymm19[4],ymm18[4],ymm19[5],ymm18[5],ymm19[6],ymm18[6],ymm19[7],ymm18[7],ymm19[16],ymm18[16],ymm19[17],ymm18[17],ymm19[18],ymm18[18],ymm19[19],ymm18[19],ymm19[20],ymm18[20],ymm19[21],ymm18[21],ymm19[22],ymm18[22],ymm19[23],ymm18[23] +; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm6, %ymm6 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQBW-SLOW-NEXT: movl $613566756, %r10d # imm = 0x24924924 +; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k1 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k1} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r8), %xmm5 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r8), %xmm13 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm6, %ymm23, %ymm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r8), %ymm21 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQBW-SLOW-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm21, %ymm15 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQBW-SLOW-NEXT: movl $1227133513, %r10d # imm = 0x49249249 +; AVX512DQBW-SLOW-NEXT: kmovd %r10d, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm6, %zmm0 {%k2} +; AVX512DQBW-SLOW-NEXT: vmovdqa (%r9), %xmm6 +; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%r9), %xmm15 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm22, %ymm23, %ymm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 (%r9), %ymm22 +; AVX512DQBW-SLOW-NEXT: vbroadcasti64x2 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512DQBW-SLOW-NEXT: # ymm26 = mem[0,1,0,1] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm22, %ymm27 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm27 = ymm27[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm25 +; AVX512DQBW-SLOW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512DQBW-SLOW-NEXT: kmovq %r10, %k3 +; 
AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm25, %zmm0 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm25 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm27 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm28 = ymm27[0],ymm25[0],ymm27[1],ymm25[1],ymm27[2],ymm25[2],ymm27[3],ymm25[3],ymm27[4],ymm25[4],ymm27[5],ymm25[5],ymm27[6],ymm25[6],ymm27[7],ymm25[7],ymm27[16],ymm25[16],ymm27[17],ymm25[17],ymm27[18],ymm25[18],ymm27[19],ymm25[19],ymm27[20],ymm25[20],ymm27[21],ymm25[21],ymm27[22],ymm25[22],ymm27[23],ymm25[23] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm29, %zmm28 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm28, %zmm7, %zmm7 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm20, %ymm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm28 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm29[0],ymm28[0],ymm29[1],ymm28[1],ymm29[2],ymm28[2],ymm29[3],ymm28[3],ymm29[4],ymm28[4],ymm29[5],ymm28[5],ymm29[6],ymm28[6],ymm29[7],ymm28[7],ymm29[16],ymm28[16],ymm29[17],ymm28[17],ymm29[18],ymm28[18],ymm29[19],ymm28[19],ymm29[20],ymm28[20],ymm29[21],ymm28[21],ymm29[22],ymm28[22],ymm29[23],ymm28[23] +; AVX512DQBW-SLOW-NEXT: vprold $16, %ymm30, %ymm30 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k1} +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm30 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm30, %ymm24 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm20, %zmm7 {%k2} +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm20, %ymm23, %ymm20 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 32(%r9), %ymm23 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm26, %ymm23, %ymm24 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm20, %zmm7 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm24 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm25, %ymm20 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm27, %ymm26 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm20[0],ymm26[1],ymm20[1],ymm26[2],ymm20[2],ymm26[3],ymm20[3],ymm26[4],ymm20[4],ymm26[5],ymm20[5],ymm26[6],ymm20[6],ymm26[7],ymm20[7],ymm26[16],ymm20[16],ymm26[17],ymm20[17],ymm26[18],ymm20[18],ymm26[19],ymm20[19],ymm26[20],ymm20[20],ymm26[21],ymm20[21],ymm26[22],ymm20[22],ymm26[23],ymm20[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm25 = 
ymm27[8],ymm25[8],ymm27[9],ymm25[9],ymm27[10],ymm25[10],ymm27[11],ymm25[11],ymm27[12],ymm25[12],ymm27[13],ymm25[13],ymm27[14],ymm25[14],ymm27[15],ymm25[15],ymm27[24],ymm25[24],ymm27[25],ymm25[25],ymm27[26],ymm25[26],ymm27[27],ymm25[27],ymm27[28],ymm25[28],ymm27[29],ymm25[29],ymm27[30],ymm25[30],ymm27[31],ymm25[31] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm25, %ymm26, %ymm25 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm20, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm27 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm28, %ymm20 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm29, %ymm31 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm31[0],ymm20[0],ymm31[1],ymm20[1],ymm31[2],ymm20[2],ymm31[3],ymm20[3],ymm31[4],ymm20[4],ymm31[5],ymm20[5],ymm31[6],ymm20[6],ymm31[7],ymm20[7],ymm31[16],ymm20[16],ymm31[17],ymm20[17],ymm31[18],ymm20[18],ymm31[19],ymm20[19],ymm31[20],ymm20[20],ymm31[21],ymm20[21],ymm31[22],ymm20[22],ymm31[23],ymm20[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm28 = ymm29[8],ymm28[8],ymm29[9],ymm28[9],ymm29[10],ymm28[10],ymm29[11],ymm28[11],ymm29[12],ymm28[12],ymm29[13],ymm28[13],ymm29[14],ymm28[14],ymm29[15],ymm28[15],ymm29[24],ymm28[24],ymm29[25],ymm28[25],ymm29[26],ymm28[26],ymm29[27],ymm28[27],ymm29[28],ymm28[28],ymm29[29],ymm28[29],ymm29[30],ymm28[30],ymm29[31],ymm28[31] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm28, %ymm29, %ymm28 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm20, %zmm20 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm30[0,1,2,3],zmm14[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm25, %zmm25 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm25 = zmm25[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: movl $-1840700270, %ecx # imm = 0x92492492 +; AVX512DQBW-SLOW-NEXT: kmovd %ecx, %k2 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k2} +; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[0,1,2,3],zmm12[4,5,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm25, %zmm23, %zmm23 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm23, %zmm20 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm16, %ymm23 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm24, %ymm17, %ymm24 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm23 = ymm24[0],ymm23[0],ymm24[1],ymm23[1],ymm24[2],ymm23[2],ymm24[3],ymm23[3],ymm24[4],ymm23[4],ymm24[5],ymm23[5],ymm24[6],ymm23[6],ymm24[7],ymm23[7],ymm24[16],ymm23[16],ymm24[17],ymm23[17],ymm24[18],ymm23[18],ymm24[19],ymm23[19],ymm24[20],ymm23[20],ymm24[21],ymm23[21],ymm24[22],ymm23[22],ymm24[23],ymm23[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm16 = 
ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm16, %ymm26, %ymm16 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm23, %zmm17 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm18, %ymm16 +; AVX512DQBW-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm23 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm16 = ymm23[0],ymm16[0],ymm23[1],ymm16[1],ymm23[2],ymm16[2],ymm23[3],ymm16[3],ymm23[4],ymm16[4],ymm23[5],ymm16[5],ymm23[6],ymm16[6],ymm23[7],ymm16[7],ymm23[16],ymm16[16],ymm23[17],ymm16[17],ymm23[18],ymm16[18],ymm23[19],ymm16[19],ymm23[20],ymm16[20],ymm23[21],ymm16[21],ymm23[22],ymm16[22],ymm23[23],ymm16[23] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm18 = ymm19[8],ymm18[8],ymm19[9],ymm18[9],ymm19[10],ymm18[10],ymm19[11],ymm18[11],ymm19[12],ymm18[12],ymm19[13],ymm18[13],ymm19[14],ymm18[14],ymm19[15],ymm18[15],ymm19[24],ymm18[24],ymm19[25],ymm18[25],ymm19[26],ymm18[26],ymm19[27],ymm18[27],ymm19[28],ymm18[28],ymm19[29],ymm18[29],ymm19[30],ymm18[30],ymm19[31],ymm18[31] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm18, %ymm29, %ymm18 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm16 +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm17, %zmm16 {%k1} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm14, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm28, %zmm14, %zmm14 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm14, %zmm16 {%k2} +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm12, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpshufb %zmm25, %zmm12, %zmm12 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,2,2,3,6,6,6,7] +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm12, %zmm16 {%k3} +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm9, %xmm14 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm17 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm9, %ymm11, %ymm9 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 +; AVX512DQBW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm17 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm10, %xmm18 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm8, %xmm8 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, 
%ymm17, %zmm8, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2} +; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm10, %zmm8, %zmm8 +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] +; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm9 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm8, %zmm10, %zmm8 +; AVX512DQBW-SLOW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 +; AVX512DQBW-SLOW-NEXT: kmovq %rcx, %k3 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm8, %zmm9 {%k3} +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm8 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm12 +; AVX512DQBW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,0,1] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQBW-SLOW-NEXT: vpermw %ymm1, %ymm11, %ymm1 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm1 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm3 +; AVX512DQBW-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm8 +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512DQBW-SLOW-NEXT: vprold $16, %xmm2, %xmm2 +; AVX512DQBW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,0,1,4,4,4,5] +; AVX512DQBW-SLOW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k2} +; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, %zmm1 {%k1} +; AVX512DQBW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,1,2,3] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 +; AVX512DQBW-SLOW-NEXT: vpermw %zmm2, %zmm10, 
%zmm2 +; AVX512DQBW-SLOW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k3} +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, 192(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm20, 320(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm7, 256(%rax) +; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQBW-SLOW-NEXT: vzeroupper +; AVX512DQBW-SLOW-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64 %in.vec2 = load <64 x i8>, ptr %in.vecptr2, align 64 @@ -5030,11 +5290,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2: {{.*}} ; AVX512: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} -; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} -; AVX512DQBW-SLOW: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 844dc41240166..ac61b2bf50c34 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -2802,13 +2802,15 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u,255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u> +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0] +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0] +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> @@ -3392,7 +3394,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512F-SLOW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpand %ymm0, %ymm9, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] @@ -3406,7 +3409,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,1,1,4,4,5,5] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-SLOW-NEXT: # ymm11 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpand %ymm11, %ymm10, %ymm10 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] @@ -3545,7 +3549,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512F-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpand %ymm0, %ymm9, %ymm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] @@ -3575,7 +3580,8 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] ; AVX512F-FAST-NEXT: vporq %zmm0, %zmm11, %zmm0 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [5,5,4,0,5,5,4,0] +; AVX512F-FAST-NEXT: # ymm12 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm11 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] @@ -6126,7 +6132,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] @@ -6158,7 +6165,7 @@ define void @store_i8_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] @@ -6170,7 +6177,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,1,1,4,4,5,5] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm10, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] @@ -6626,7 +6634,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] @@ -6635,7 +6644,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm8, %ymm11, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] @@ -6658,7 +6668,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm10, 
%ymm15, %ymm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] @@ -7052,7 +7063,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] @@ -7061,7 +7073,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm6, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] @@ -7084,7 +7097,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm14, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] @@ -7093,7 +7107,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm10, %ymm12, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31] +; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm4, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] @@ -7216,7 +7231,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm23 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = 
<13,u,u,u,u,u,128,14,u,u,u,u,u,128,15,u,u,u,u,u,128,16,u,u,u,u,u,128,17,u,u,u> +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512F-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm29 ; AVX512F-SLOW-NEXT: vporq %ymm0, %ymm1, %ymm24 @@ -7231,7 +7247,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm9 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[30],zero,ymm9[28],zero,zero,zero,zero,ymm9[31],zero,ymm9[29],zero,zero ; AVX512F-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512F-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm8, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -7239,7 +7256,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm6, %ymm3 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7361,7 +7379,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm26, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29],zero,zero ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14] ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm12 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm28 @@ -7379,7 +7397,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm10 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm13, %ymm26 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10,9,8,7,0,0,0,11,10] ; AVX512F-SLOW-NEXT: vmovdqa %ymm15, %ymm3 ; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm15, %ymm15 ; 
AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm23 @@ -7419,7 +7437,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm15[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u> +; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm15 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7438,18 +7456,21 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa %ymm6, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm25, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa %ymm4, %ymm15 ; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512F-SLOW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm28 ; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -7493,7 +7514,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vporq %zmm4, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-SLOW-NEXT: vpand %ymm4, %ymm11, %ymm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 ; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload @@ -7625,438 +7647,885 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; -; AVX512F-FAST-LABEL: store_i8_stride7_vf64: -; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $1432, %rsp # imm = 0x598 -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %ymm2 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm15 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512F-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] -; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512F-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 -; AVX512F-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21 -; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm12, %xmm29 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512F-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm5, 
%ymm12, %ymm5 -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512F-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 -; AVX512F-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX512F-FAST-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa (%rax), %xmm0 -; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] -; AVX512F-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa (%rax), %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 -; 
AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm25 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; AVX512F-FAST-NEXT: vmovdqa %xmm14, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30 -; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u> -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %xmm10 -; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7 -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; AVX512F-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3] -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, 
%xmm5 -; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512F-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] -; AVX512F-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9 -; AVX512F-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 -; AVX512F-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3 -; AVX512F-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 -; AVX512F-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11 -; AVX512F-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2 -; AVX512F-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512F-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512F-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1] -; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7] -; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero -; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8 -; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] -; 
AVX512F-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20 -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: # ymm21 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29 -; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1 -; AVX512F-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 -; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512F-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 -; AVX512F-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4 -; 
AVX512F-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 -; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax) -; AVX512F-FAST-NEXT: addq $1432, %rsp # imm = 0x598 -; AVX512F-FAST-NEXT: vzeroupper -; AVX512F-FAST-NEXT: retq +; AVX512F-ONLY-FAST-LABEL: store_i8_stride7_vf64: +; AVX512F-ONLY-FAST: # %bb.0: +; AVX512F-ONLY-FAST-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm1, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm2, %ymm6 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512F-ONLY-FAST-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), 
%xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm12, %xmm29 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] +; AVX512F-ONLY-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rax), %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512F-ONLY-FAST-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm11, %xmm4 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm9, %xmm27 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm10, %xmm25 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm14, %xmm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30 +; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; 
AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512F-ONLY-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = 
[18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512F-ONLY-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2 +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512F-ONLY-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} 
zmm11 = zmm30[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11 +; AVX512F-ONLY-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17 +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512F-ONLY-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512F-ONLY-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8 +; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = 
ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19 +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512F-ONLY-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 +; AVX512F-ONLY-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5] +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4 +; AVX512F-ONLY-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5 +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax) +; AVX512F-ONLY-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512F-ONLY-FAST-NEXT: vzeroupper +; AVX512F-ONLY-FAST-NEXT: retq +; +; AVX512DQ-FAST-LABEL: store_i8_stride7_vf64: +; AVX512DQ-FAST: # %bb.0: +; AVX512DQ-FAST-NEXT: subq $1432, %rsp # imm = 0x598 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] +; AVX512DQ-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] +; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512DQ-FAST-NEXT: 
vmovdqa %ymm2, %ymm6 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero,ymm0[18] +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 +; AVX512DQ-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[13,u,u,u,u,u],zero,ymm1[14,u,u,u,u,u],zero,ymm1[15,u,u,u,u,u],zero,ymm1[16,u,u,u,u,u],zero,ymm1[17,u,u,u] +; AVX512DQ-FAST-NEXT: vporq %ymm0, %ymm1, %ymm22 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-FAST-NEXT: vpor %xmm0, 
%xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vporq %xmm0, %xmm3, %xmm21 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512DQ-FAST-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm12, %xmm29 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm11, %xmm5 +; AVX512DQ-FAST-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm12 +; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm12, %ymm5 +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm10, %ymm18 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[0,1,14],zero,ymm8[12,13,0,1,14,15],zero,ymm8[3,12,13,2,3,16],zero,ymm8[30,31,28,29,16,17],zero,ymm8[31,18,19,28,29,18],zero +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm15, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm16 +; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm4 +; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm1 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, %xmm5 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25] +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm24, %ymm7 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm2[18,19,20,21],zero,ymm2[19],zero,ymm2[25,26,27,22],zero,ymm2[20],zero +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512DQ-FAST-NEXT: vpandn %ymm1, %ymm4, %ymm4 +; AVX512DQ-FAST-NEXT: vmovdqa (%rax), %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm8 +; AVX512DQ-FAST-NEXT: 
vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm22 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512DQ-FAST-NEXT: vbroadcasti64x2 {{.*#+}} ymm24 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] +; AVX512DQ-FAST-NEXT: # ymm24 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm24, %ymm8, %ymm15 +; AVX512DQ-FAST-NEXT: vmovdqa %xmm11, %xmm4 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm15[0,1,2,3],zmm8[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm30, %xmm1 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm9, %xmm27 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm10, %xmm25 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm20, %xmm11 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; AVX512DQ-FAST-NEXT: vmovdqa %xmm14, %xmm8 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm21, %zmm30 +; AVX512DQ-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm11 +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm19, %ymm3 +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm10 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm19 = ymm7[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[21],zero,ymm0[19],zero,zero,zero,zero,ymm0[22],zero,ymm0[20],zero,zero +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,5,7,4,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm26 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; AVX512DQ-FAST-NEXT: vpandnq %ymm0, %ymm26, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm7 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm15, %ymm13 +; 
AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm13[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,29,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm4 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,3,2,3] +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm14 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512DQ-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm15, %ymm15 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512DQ-FAST-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[9,u,7,u,u,u,u,10,u,8,u,u,u,u,11,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm2 +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpor %ymm6, %ymm4, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[18374966859431673855,18446463693966278655,18374966859431673855,18446463693966278655] +; AVX512DQ-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpternlogq $248, %ymm4, %ymm11, %ymm31 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm14, %zmm31, %zmm9 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm9 +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm12, %ymm1 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vpor %ymm3, %ymm15, %ymm3 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm1, %zmm11, %zmm3 +; AVX512DQ-FAST-NEXT: vpandq %ymm4, %ymm21, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm20, %zmm1 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm4, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm19, %ymm4 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm5, %zmm4, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm18, %ymm1 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm5 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm5, %zmm1, %zmm1 +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = mem[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm30[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm4, %zmm5, %zmm11 +; AVX512DQ-FAST-NEXT: vpandq %ymm24, %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vporq %zmm2, %zmm4, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm0, %zmm5, %zmm2 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512DQ-FAST-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-FAST-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm16 = zmm0[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[1,1,0,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,0,1,2,0,0,1] +; AVX512DQ-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm18 +; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-FAST-NEXT: vpermd %ymm12, %ymm5, %ymm17 +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm12 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512DQ-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm14 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero +; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm13 +; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] +; AVX512DQ-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm8 +; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [4,5,4,5,5,7,4,5] +; AVX512DQ-FAST-NEXT: vpermd %ymm6, %ymm19, %ymm19 +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm20 = mem[2,3,2,3,6,7,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm20 +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: # ymm21 = mem[0,1,0,1] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq 
{{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm21 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm21 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm22 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm22 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm29 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm29 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm1 +; AVX512DQ-FAST-NEXT: vpor %ymm5, %ymm14, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 +; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] +; AVX512DQ-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm7 +; AVX512DQ-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: # zmm3 = mem[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm23[0,1,0,1,4,5,4,5] +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm17, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpor %ymm4, %ymm15, %ymm4 +; AVX512DQ-FAST-NEXT: vpor %ymm13, %ymm8, %ymm5 +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm4[4,5,6,7] +; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm19, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 320(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 256(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm29, 192(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm22, 64(%rax) +; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm20, 384(%rax) +; AVX512DQ-FAST-NEXT: addq $1432, %rsp # imm = 0x598 +; AVX512DQ-FAST-NEXT: vzeroupper +; AVX512DQ-FAST-NEXT: retq ; ; AVX512BW-SLOW-LABEL: store_i8_stride7_vf64: ; AVX512BW-SLOW: # %bb.0: @@ -8170,7 +8639,7 @@ define void 
@store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm4, %zmm11 {%k2} ; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm20[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] ; AVX512BW-SLOW-NEXT: movl $338170920, %esi # imm = 0x14281428 ; AVX512BW-SLOW-NEXT: kmovd %esi, %k2 ; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm21, %ymm3 {%k2} @@ -8181,7 +8650,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm26 ; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm17[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,3,3,4,6,7,7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u> +; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] ; AVX512BW-SLOW-NEXT: vpshufb %ymm7, %ymm18, %ymm4 ; AVX512BW-SLOW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm19 = ymm4[2,3,2,3] @@ -8215,7 +8684,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm25[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm0 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512BW-SLOW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] @@ -8711,11 +9181,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} -; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} -; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} ; FALLBACK1: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll index 64f8ed9c20436..12d6b9cb582fc 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -65,7 +65,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; AVX2-LABEL: testv4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -93,7 +94,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; AVX512VL-LABEL: testv4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -121,7 +123,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv4i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -161,7 +164,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; X32-AVX-LABEL: testv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -247,7 +251,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; AVX2-LABEL: testv4i64u: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -275,7 +280,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; AVX512VL-LABEL: testv4i64u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -303,7 +309,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv4i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -343,7 +350,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; X32-AVX-LABEL: testv4i64u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb 
%ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -419,7 +427,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX2-LABEL: testv8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -442,7 +451,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX512VL-LABEL: testv8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -465,7 +475,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv8i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -500,7 +511,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; X32-AVX-LABEL: testv8i32: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -571,7 +583,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX2-LABEL: testv8i32u: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -594,7 +607,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX512VL-LABEL: testv8i32u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -617,7 +631,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv8i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -652,7 +667,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; X32-AVX-LABEL: testv8i32u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -713,7 +729,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX2-LABEL: testv16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -731,7 +748,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX512VL-LABEL: testv16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -749,7 +767,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv16i16: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -775,7 +794,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; X32-AVX-LABEL: testv16i16: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -830,7 +850,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX2-LABEL: testv16i16u: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: 
vpsrlw $4, %ymm0, %ymm3 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 @@ -848,7 +869,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX512VL-LABEL: testv16i16u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -866,7 +888,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv16i16u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %ymm3 @@ -892,7 +915,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; X32-AVX-LABEL: testv16i16u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm3 ; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm3, %ymm3 @@ -937,7 +961,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX2-LABEL: testv32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -950,7 +975,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512VL-LABEL: testv32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 @@ -963,7 +989,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv32i8: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 @@ -989,7 +1016,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; X32-AVX-LABEL: testv32i8: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: 
vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 @@ -1029,7 +1057,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; AVX2-LABEL: testv32i8u: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1042,7 +1071,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; AVX512VL-LABEL: testv32i8u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VL-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 @@ -1055,7 +1085,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv32i8u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 @@ -1081,7 +1112,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; X32-AVX-LABEL: testv32i8u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X32-AVX-NEXT: # ymm1 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm0, %ymm1, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll index c015185fe4511..a724babe469c5 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -29,9 +29,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 @@ -56,12 +57,13 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 @@ -106,9 +108,10 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 @@ -133,12 +136,13 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1 ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 -; 
AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 @@ -181,9 +185,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 @@ -210,12 +215,13 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 @@ -266,9 +272,10 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 @@ -295,12 +302,13 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 @@ -335,7 +343,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512CD-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 @@ -361,7 +369,8 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0 @@ -379,10 +388,11 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512DQ-LABEL: testv32i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; 
AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7 @@ -419,7 +429,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512CD-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 @@ -445,7 +455,8 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0 @@ -463,10 +474,11 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512DQ-LABEL: testv32i16u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7 @@ -508,7 +520,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = 
[24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CD-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm3, %zmm3 @@ -549,7 +561,8 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0 @@ -561,10 +574,11 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512DQ-LABEL: testv64i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6 @@ -596,7 +610,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CD-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm3, %zmm3 @@ -637,7 +651,8 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $4, %zmm0, 
%zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: vbroadcasti64x2 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512BW-NEXT: # zmm2 = mem[0,1,0,1,0,1,0,1] ; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0 @@ -649,10 +664,11 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512DQ-LABEL: testv64i8u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll index 56daf987c829e..8e8aca15860d5 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -107,7 +107,7 @@ define <16 x i8> @ult_2_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -115,7 +115,7 @@ define <16 x i8> @ult_2_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_2_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -210,7 +210,7 @@ define <16 x i8> @ugt_2_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_2_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -347,7 +347,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -355,7 +355,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -364,7 +364,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -374,7 +374,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -383,7 +383,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -391,7 +391,7 @@ define <16 x i8> @ult_3_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_3_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -486,7 +486,7 @@ define <16 x i8> @ugt_3_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, 
%xmm3, %xmm2 @@ -623,7 +623,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -631,7 +631,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -640,7 +640,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -650,7 +650,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -659,7 +659,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -667,7 +667,7 @@ define <16 x i8> @ult_4_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_4_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -762,7 +762,7 @@ define <16 x i8> 
@ugt_4_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -899,7 +899,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_5_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -907,7 +907,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -916,7 +916,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -926,7 +926,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -935,7 +935,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -943,7 +943,7 @@ define <16 x i8> @ult_5_v16i8(<16 x i8> %0) { ; BITALG-LABEL: 
ult_5_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1038,7 +1038,7 @@ define <16 x i8> @ugt_5_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_5_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1175,7 +1175,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_6_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1183,7 +1183,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1192,7 +1192,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1202,7 +1202,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -1211,7 +1211,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1219,7 +1219,7 @@ define <16 x i8> @ult_6_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_6_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1314,7 +1314,7 @@ define <16 x i8> @ugt_6_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ugt_6_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1451,7 +1451,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; ; AVX2-LABEL: ult_7_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1459,7 +1459,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1468,7 +1468,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1478,7 +1478,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -1487,7 +1487,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1495,7 +1495,7 @@ define <16 x i8> @ult_7_v16i8(<16 x i8> %0) { ; BITALG-LABEL: ult_7_v16i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %0) @@ -1600,7 +1600,7 @@ define <8 x i16> @ult_2_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1608,7 +1608,7 @@ define <8 x i16> @ult_2_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_2_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -1720,7 +1720,7 @@ define <8 x i16> @ugt_2_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_2_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1879,7 +1879,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1890,7 +1890,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1899,7 +1899,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; 
AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1909,7 +1909,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -1918,7 +1918,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1926,7 +1926,7 @@ define <8 x i16> @ult_3_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_3_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2038,7 +2038,7 @@ define <8 x i16> @ugt_3_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2197,7 +2197,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2208,7 +2208,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2217,7 +2217,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2227,7 +2227,7 @@ define <8 x i16> @ult_4_v8i16(<8 x 
i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -2236,7 +2236,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2244,7 +2244,7 @@ define <8 x i16> @ult_4_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_4_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2356,7 +2356,7 @@ define <8 x i16> @ugt_4_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2515,7 +2515,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2526,7 +2526,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2535,7 +2535,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2545,7 +2545,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = 
[5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -2554,7 +2554,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2562,7 +2562,7 @@ define <8 x i16> @ult_5_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_5_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2674,7 +2674,7 @@ define <8 x i16> @ugt_5_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2833,7 +2833,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2844,7 +2844,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2853,7 +2853,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2863,7 +2863,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -2872,7 +2872,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; 
BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2880,7 +2880,7 @@ define <8 x i16> @ult_6_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_6_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -2992,7 +2992,7 @@ define <8 x i16> @ugt_6_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3151,7 +3151,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_7_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3162,7 +3162,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3171,7 +3171,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3181,7 +3181,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -3190,7 +3190,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: 
vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3198,7 +3198,7 @@ define <8 x i16> @ult_7_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_7_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3310,7 +3310,7 @@ define <8 x i16> @ugt_7_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_7_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3469,7 +3469,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_8_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3480,7 +3480,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3489,7 +3489,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3499,7 +3499,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -3508,7 +3508,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3516,7 +3516,7 @@ define <8 x i16> @ult_8_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_8_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; 
BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3628,7 +3628,7 @@ define <8 x i16> @ugt_8_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_8_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3787,7 +3787,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_9_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3798,7 +3798,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3807,7 +3807,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3817,7 +3817,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -3826,7 +3826,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3834,7 +3834,7 @@ define <8 x i16> @ult_9_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_9_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -3946,7 +3946,7 @@ define <8 x i16> @ugt_9_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_9_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4105,7 +4105,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_10_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4116,7 +4116,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4125,7 +4125,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4135,7 +4135,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -4144,7 +4144,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4152,7 +4152,7 @@ define <8 x i16> @ult_10_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_10_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4264,7 +4264,7 @@ define <8 x i16> @ugt_10_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_10_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4423,7 +4423,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_11_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4434,7 +4434,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4443,7 +4443,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4453,7 +4453,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -4462,7 +4462,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4470,7 +4470,7 @@ define <8 x i16> @ult_11_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_11_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4582,7 +4582,7 @@ define <8 x i16> @ugt_11_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_11_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4741,7 +4741,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_12_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} 
xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4752,7 +4752,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4761,7 +4761,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4771,7 +4771,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -4780,7 +4780,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4788,7 +4788,7 @@ define <8 x i16> @ult_12_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_12_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -4900,7 +4900,7 @@ define <8 x i16> @ugt_12_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_12_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5059,7 +5059,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_13_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 
= [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5070,7 +5070,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5079,7 +5079,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5089,7 +5089,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -5098,7 +5098,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5106,7 +5106,7 @@ define <8 x i16> @ult_13_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_13_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -5218,7 +5218,7 @@ define <8 x i16> @ugt_13_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_13_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5377,7 +5377,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_14_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5388,7 +5388,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 
; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5397,7 +5397,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5407,7 +5407,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -5416,7 +5416,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5424,7 +5424,7 @@ define <8 x i16> @ult_14_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_14_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -5536,7 +5536,7 @@ define <8 x i16> @ugt_14_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ugt_14_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5695,7 +5695,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; ; AVX2-LABEL: ult_15_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5706,7 +5706,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; 
AVX2-NEXT: retq ; @@ -5715,7 +5715,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5725,7 +5725,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vzeroupper ; AVX512VPOPCNTDQVL-NEXT: retq @@ -5734,7 +5734,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5742,7 +5742,7 @@ define <8 x i16> @ult_15_v8i16(<8 x i16> %0) { ; BITALG-LABEL: ult_15_v8i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; BITALG-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; BITALG-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG-NEXT: retq %2 = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %0) @@ -6011,7 +6011,7 @@ define <4 x i32> @ugt_2_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_2_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6200,7 +6200,7 @@ define <4 x i32> @ult_3_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_3_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6385,7 +6385,7 @@ define <4 x i32> @ugt_3_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_3_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6574,7 +6574,7 @@ define <4 x i32> @ult_4_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_4_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6759,7 +6759,7 @@ define <4 x i32> @ugt_4_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_4_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6948,7 +6948,7 @@ define <4 x i32> @ult_5_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_5_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7133,7 +7133,7 @@ define <4 x i32> @ugt_5_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_5_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7322,7 +7322,7 @@ define <4 x i32> @ult_6_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_6_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7507,7 +7507,7 @@ define <4 x i32> @ugt_6_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_6_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7696,7 +7696,7 @@ define <4 x i32> @ult_7_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_7_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7881,7 +7881,7 @@ define <4 x i32> @ugt_7_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_7_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8070,7 +8070,7 @@ define <4 x i32> @ult_8_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_8_v4i32: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8255,7 +8255,7 @@ define <4 x i32> @ugt_8_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_8_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8444,7 +8444,7 @@ define <4 x i32> @ult_9_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_9_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8629,7 +8629,7 @@ define <4 x i32> @ugt_9_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_9_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8818,7 +8818,7 @@ define <4 x i32> @ult_10_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_10_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9003,7 +9003,7 @@ define <4 x i32> @ugt_10_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_10_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9192,7 +9192,7 @@ define <4 x i32> @ult_11_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_11_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9377,7 +9377,7 @@ define <4 x i32> @ugt_11_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_11_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9566,7 +9566,7 @@ define <4 x i32> @ult_12_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: 
ult_12_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9751,7 +9751,7 @@ define <4 x i32> @ugt_12_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_12_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9940,7 +9940,7 @@ define <4 x i32> @ult_13_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_13_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10125,7 +10125,7 @@ define <4 x i32> @ugt_13_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_13_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10314,7 +10314,7 @@ define <4 x i32> @ult_14_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_14_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10499,7 +10499,7 @@ define <4 x i32> @ugt_14_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_14_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10688,7 +10688,7 @@ define <4 x i32> @ult_15_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_15_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10873,7 +10873,7 @@ define <4 x i32> @ugt_15_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_15_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11062,7 +11062,7 @@ define <4 x i32> 
@ult_16_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_16_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11247,7 +11247,7 @@ define <4 x i32> @ugt_16_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_16_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11436,7 +11436,7 @@ define <4 x i32> @ult_17_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_17_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11621,7 +11621,7 @@ define <4 x i32> @ugt_17_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_17_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11810,7 +11810,7 @@ define <4 x i32> @ult_18_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_18_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11995,7 +11995,7 @@ define <4 x i32> @ugt_18_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_18_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12184,7 +12184,7 @@ define <4 x i32> @ult_19_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_19_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12369,7 +12369,7 @@ define <4 x i32> @ugt_19_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_19_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, 
%xmm2 @@ -12558,7 +12558,7 @@ define <4 x i32> @ult_20_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_20_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12743,7 +12743,7 @@ define <4 x i32> @ugt_20_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_20_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12932,7 +12932,7 @@ define <4 x i32> @ult_21_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_21_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13117,7 +13117,7 @@ define <4 x i32> @ugt_21_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_21_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13306,7 +13306,7 @@ define <4 x i32> @ult_22_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_22_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13491,7 +13491,7 @@ define <4 x i32> @ugt_22_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_22_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13680,7 +13680,7 @@ define <4 x i32> @ult_23_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_23_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13865,7 +13865,7 @@ define <4 x i32> @ugt_23_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_23_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14054,7 +14054,7 @@ define <4 x i32> @ult_24_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_24_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14239,7 +14239,7 @@ define <4 x i32> @ugt_24_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_24_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14428,7 +14428,7 @@ define <4 x i32> @ult_25_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_25_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14613,7 +14613,7 @@ define <4 x i32> @ugt_25_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_25_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14802,7 +14802,7 @@ define <4 x i32> @ult_26_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_26_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14987,7 +14987,7 @@ define <4 x i32> @ugt_26_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_26_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15176,7 +15176,7 @@ define <4 x i32> @ult_27_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_27_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15361,7 +15361,7 @@ define <4 x i32> @ugt_27_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_27_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, 
%xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15550,7 +15550,7 @@ define <4 x i32> @ult_28_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_28_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15735,7 +15735,7 @@ define <4 x i32> @ugt_28_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_28_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15924,7 +15924,7 @@ define <4 x i32> @ult_29_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_29_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16109,7 +16109,7 @@ define <4 x i32> @ugt_29_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_29_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16298,7 +16298,7 @@ define <4 x i32> @ult_30_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_30_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16483,7 +16483,7 @@ define <4 x i32> @ugt_30_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ugt_30_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16672,7 +16672,7 @@ define <4 x i32> @ult_31_v4i32(<4 x i32> %0) { ; ; AVX2-LABEL: ult_31_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16917,7 +16917,7 @@ define <2 x i64> @ult_2_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2] +; 
AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17054,7 +17054,7 @@ define <2 x i64> @ugt_2_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_2_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17213,7 +17213,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_3_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17223,7 +17223,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17231,7 +17231,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17249,7 +17249,7 @@ define <2 x i64> @ult_3_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -17370,7 +17370,7 @@ define <2 x i64> @ugt_3_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_3_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17529,7 +17529,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_4_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17539,7 +17539,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17547,7 +17547,7 @@ define 
<2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17565,7 +17565,7 @@ define <2 x i64> @ult_4_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -17686,7 +17686,7 @@ define <2 x i64> @ugt_4_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_4_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17845,7 +17845,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_5_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17855,7 +17855,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17863,7 +17863,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17881,7 +17881,7 @@ define <2 x i64> @ult_5_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18002,7 +18002,7 @@ define <2 x i64> @ugt_5_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_5_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18161,7 +18161,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_6_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18171,7 +18171,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18179,7 +18179,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18197,7 +18197,7 @@ define <2 x i64> @ult_6_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18318,7 +18318,7 @@ define <2 x i64> @ugt_6_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_6_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18477,7 +18477,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_7_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18487,7 +18487,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18495,7 +18495,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18513,7 +18513,7 @@ define <2 x i64> @ult_7_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, 
%xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18634,7 +18634,7 @@ define <2 x i64> @ugt_7_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_7_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18793,7 +18793,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_8_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18803,7 +18803,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18811,7 +18811,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18829,7 +18829,7 @@ define <2 x i64> @ult_8_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18950,7 +18950,7 @@ define <2 x i64> @ugt_8_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_8_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19109,7 +19109,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_9_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19119,7 +19119,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19127,7 +19127,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 
killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19145,7 +19145,7 @@ define <2 x i64> @ult_9_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19266,7 +19266,7 @@ define <2 x i64> @ugt_9_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_9_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19425,7 +19425,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_10_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19435,7 +19435,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19443,7 +19443,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19461,7 +19461,7 @@ define <2 x i64> @ult_10_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19582,7 +19582,7 @@ define <2 x i64> @ugt_10_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_10_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19741,7 +19741,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_11_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19751,7 +19751,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19759,7 +19759,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19777,7 +19777,7 @@ define <2 x i64> @ult_11_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19898,7 +19898,7 @@ define <2 x i64> @ugt_11_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_11_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20057,7 +20057,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_12_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20067,7 +20067,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20075,7 +20075,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20093,7 +20093,7 @@ define <2 x i64> @ult_12_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq 
@@ -20214,7 +20214,7 @@ define <2 x i64> @ugt_12_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_12_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20373,7 +20373,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_13_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20383,7 +20383,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20391,7 +20391,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20409,7 +20409,7 @@ define <2 x i64> @ult_13_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20530,7 +20530,7 @@ define <2 x i64> @ugt_13_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_13_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20689,7 +20689,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_14_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20699,7 +20699,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20707,7 +20707,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20725,7 +20725,7 @@ define <2 x i64> @ult_14_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20846,7 +20846,7 @@ define <2 x i64> @ugt_14_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_14_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21005,7 +21005,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_15_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21015,7 +21015,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21023,7 +21023,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21041,7 +21041,7 @@ define <2 x i64> @ult_15_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21162,7 +21162,7 @@ define <2 x i64> @ugt_15_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_15_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21321,7 +21321,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_16_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21331,7 +21331,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21339,7 +21339,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21357,7 +21357,7 @@ define <2 x i64> @ult_16_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21478,7 +21478,7 @@ define <2 x i64> @ugt_16_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_16_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21637,7 +21637,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_17_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21647,7 +21647,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21655,7 +21655,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21673,7 +21673,7 @@ define <2 x i64> @ult_17_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq 
@@ -21794,7 +21794,7 @@ define <2 x i64> @ugt_17_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_17_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21953,7 +21953,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_18_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21963,7 +21963,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21971,7 +21971,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21989,7 +21989,7 @@ define <2 x i64> @ult_18_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22110,7 +22110,7 @@ define <2 x i64> @ugt_18_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_18_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22269,7 +22269,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_19_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22279,7 +22279,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22287,7 +22287,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22305,7 +22305,7 @@ define <2 x i64> @ult_19_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22426,7 +22426,7 @@ define <2 x i64> @ugt_19_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_19_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22585,7 +22585,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_20_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22595,7 +22595,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22603,7 +22603,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22621,7 +22621,7 @@ define <2 x i64> @ult_20_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22742,7 +22742,7 @@ define <2 x i64> @ugt_20_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_20_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22901,7 +22901,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_21_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -22911,7 +22911,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -22919,7 +22919,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -22937,7 +22937,7 @@ define <2 x i64> @ult_21_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -23058,7 +23058,7 @@ define <2 x i64> @ugt_21_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_21_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23217,7 +23217,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_22_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23227,7 +23227,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -23235,7 +23235,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -23253,7 +23253,7 @@ define <2 x i64> @ult_22_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -23374,7 +23374,7 @@ define <2 x i64> @ugt_22_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_22_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23533,7 +23533,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_23_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23543,7 +23543,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -23551,7 +23551,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -23569,7 +23569,7 @@ define <2 x i64> @ult_23_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -23690,7 +23690,7 @@ define <2 x i64> @ugt_23_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_23_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23849,7 +23849,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_24_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -23859,7 +23859,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -23867,7 +23867,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -23885,7 +23885,7 @@ define <2 x i64> @ult_24_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -24006,7 +24006,7 @@ define <2 x i64> @ugt_24_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_24_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24165,7 +24165,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_25_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24175,7 +24175,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -24183,7 +24183,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -24201,7 +24201,7 @@ define <2 x i64> @ult_25_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -24322,7 +24322,7 @@ define <2 x i64> @ugt_25_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_25_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24481,7 +24481,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_26_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24491,7 +24491,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -24499,7 +24499,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -24517,7 +24517,7 @@ define <2 x i64> @ult_26_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -24638,7 +24638,7 @@ define <2 x i64> @ugt_26_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ugt_26_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24797,7 +24797,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ;
 ; AVX2-LABEL: ult_27_v2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
@@ -24807,7 +24807,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -24815,7 +24815,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; AVX512VPOPCNTDQ: # %bb.0:
 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
 ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; AVX512VPOPCNTDQ-NEXT: vzeroupper
 ; AVX512VPOPCNTDQ-NEXT: retq
@@ -24833,7 +24833,7 @@ define <2 x i64> @ult_27_v2i64(<2 x i64> %0) {
 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27]
+; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27]
 ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
 ; BITALG_NOVLX-NEXT: vzeroupper
 ; BITALG_NOVLX-NEXT: retq
@@ -24954,7 +24954,7 @@ define <2 x i64> @ugt_27_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_27_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25113,7 +25113,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_28_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25123,7 +25123,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25131,7 +25131,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25149,7 +25149,7 @@ define <2 x i64> @ult_28_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25270,7 +25270,7 @@ define <2 x i64> @ugt_28_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_28_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25429,7 +25429,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_29_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25439,7 +25439,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25447,7 +25447,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25465,7 +25465,7 @@ define <2 x i64> @ult_29_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25586,7 +25586,7 @@ define <2 x i64> @ugt_29_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_29_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25745,7 +25745,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_30_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25755,7 +25755,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25763,7 +25763,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25781,7 +25781,7 @@ define <2 x i64> @ult_30_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25902,7 +25902,7 @@ define <2 x i64> @ugt_30_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_30_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26061,7 +26061,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_31_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26071,7 +26071,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26079,7 +26079,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26097,7 +26097,7 @@ define <2 x i64> @ult_31_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26218,7 +26218,7 @@ define <2 x i64> @ugt_31_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_31_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26377,7 +26377,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_32_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26387,7 +26387,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26395,7 +26395,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26413,7 +26413,7 @@ define <2 x i64> @ult_32_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq 
@@ -26534,7 +26534,7 @@ define <2 x i64> @ugt_32_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_32_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26693,7 +26693,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_33_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26703,7 +26703,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26711,7 +26711,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26729,7 +26729,7 @@ define <2 x i64> @ult_33_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26850,7 +26850,7 @@ define <2 x i64> @ugt_33_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_33_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27009,7 +27009,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_34_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27019,7 +27019,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27027,7 +27027,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27045,7 +27045,7 @@ define <2 x i64> @ult_34_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27166,7 +27166,7 @@ define <2 x i64> @ugt_34_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_34_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27325,7 +27325,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_35_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27335,7 +27335,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27343,7 +27343,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27361,7 +27361,7 @@ define <2 x i64> @ult_35_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27482,7 +27482,7 @@ define <2 x i64> @ugt_35_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_35_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27641,7 +27641,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_36_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27651,7 +27651,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27659,7 +27659,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27677,7 +27677,7 @@ define <2 x i64> @ult_36_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27798,7 +27798,7 @@ define <2 x i64> @ugt_36_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_36_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27957,7 +27957,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_37_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27967,7 +27967,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27975,7 +27975,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27993,7 +27993,7 @@ define <2 x i64> @ult_37_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq 
@@ -28114,7 +28114,7 @@ define <2 x i64> @ugt_37_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_37_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28273,7 +28273,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_38_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28283,7 +28283,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28291,7 +28291,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28309,7 +28309,7 @@ define <2 x i64> @ult_38_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28430,7 +28430,7 @@ define <2 x i64> @ugt_38_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_38_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28589,7 +28589,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_39_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28599,7 +28599,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28607,7 +28607,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28625,7 +28625,7 @@ define <2 x i64> @ult_39_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28746,7 +28746,7 @@ define <2 x i64> @ugt_39_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_39_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28905,7 +28905,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_40_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28915,7 +28915,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28923,7 +28923,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28941,7 +28941,7 @@ define <2 x i64> @ult_40_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29062,7 +29062,7 @@ define <2 x i64> @ugt_40_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_40_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29221,7 +29221,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_41_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29231,7 +29231,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29239,7 +29239,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29257,7 +29257,7 @@ define <2 x i64> @ult_41_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29378,7 +29378,7 @@ define <2 x i64> @ugt_41_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_41_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29537,7 +29537,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_42_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29547,7 +29547,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29555,7 +29555,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29573,7 +29573,7 @@ define <2 x i64> @ult_42_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq 
@@ -29694,7 +29694,7 @@ define <2 x i64> @ugt_42_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_42_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29853,7 +29853,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_43_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29863,7 +29863,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29871,7 +29871,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29889,7 +29889,7 @@ define <2 x i64> @ult_43_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30010,7 +30010,7 @@ define <2 x i64> @ugt_43_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_43_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30169,7 +30169,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_44_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30179,7 +30179,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30187,7 +30187,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30205,7 +30205,7 @@ define <2 x i64> @ult_44_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30326,7 +30326,7 @@ define <2 x i64> @ugt_44_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_44_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30485,7 +30485,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_45_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30495,7 +30495,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30503,7 +30503,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30521,7 +30521,7 @@ define <2 x i64> @ult_45_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30642,7 +30642,7 @@ define <2 x i64> @ugt_45_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_45_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30801,7 +30801,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_46_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30811,7 +30811,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30819,7 +30819,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30837,7 +30837,7 @@ define <2 x i64> @ult_46_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30958,7 +30958,7 @@ define <2 x i64> @ugt_46_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_46_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31117,7 +31117,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_47_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31127,7 +31127,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31135,7 +31135,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31153,7 +31153,7 @@ define <2 x i64> @ult_47_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq 
@@ -31274,7 +31274,7 @@ define <2 x i64> @ugt_47_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_47_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31433,7 +31433,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_48_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31443,7 +31443,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31451,7 +31451,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31469,7 +31469,7 @@ define <2 x i64> @ult_48_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31590,7 +31590,7 @@ define <2 x i64> @ugt_48_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_48_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31749,7 +31749,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_49_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31759,7 +31759,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31767,7 +31767,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31785,7 +31785,7 @@ define <2 x i64> @ult_49_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31906,7 +31906,7 @@ define <2 x i64> @ugt_49_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_49_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32065,7 +32065,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_50_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32075,7 +32075,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32083,7 +32083,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32101,7 +32101,7 @@ define <2 x i64> @ult_50_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32222,7 +32222,7 @@ define <2 x i64> @ugt_50_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_50_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32381,7 +32381,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_51_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32391,7 +32391,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32399,7 +32399,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32417,7 +32417,7 @@ define <2 x i64> @ult_51_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32538,7 +32538,7 @@ define <2 x i64> @ugt_51_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_51_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32697,7 +32697,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_52_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32707,7 +32707,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32715,7 +32715,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32733,7 +32733,7 @@ define <2 x i64> @ult_52_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq 
@@ -32854,7 +32854,7 @@ define <2 x i64> @ugt_52_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_52_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33013,7 +33013,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_53_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33023,7 +33023,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33031,7 +33031,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33049,7 +33049,7 @@ define <2 x i64> @ult_53_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33170,7 +33170,7 @@ define <2 x i64> @ugt_53_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_53_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33329,7 +33329,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_54_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33339,7 +33339,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33347,7 +33347,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33365,7 +33365,7 @@ define <2 x i64> @ult_54_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33486,7 +33486,7 @@ define <2 x i64> @ugt_54_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_54_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33645,7 +33645,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_55_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33655,7 +33655,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33663,7 +33663,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33681,7 +33681,7 @@ define <2 x i64> @ult_55_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33802,7 +33802,7 @@ define <2 x i64> @ugt_55_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_55_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33961,7 +33961,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_56_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33971,7 +33971,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33979,7 +33979,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33997,7 +33997,7 @@ define <2 x i64> @ult_56_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34118,7 +34118,7 @@ define <2 x i64> @ugt_56_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_56_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34277,7 +34277,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_57_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34287,7 +34287,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34295,7 +34295,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34313,7 +34313,7 @@ define <2 x i64> @ult_57_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq 
@@ -34434,7 +34434,7 @@ define <2 x i64> @ugt_57_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_57_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34593,7 +34593,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_58_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34603,7 +34603,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34611,7 +34611,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34629,7 +34629,7 @@ define <2 x i64> @ult_58_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34750,7 +34750,7 @@ define <2 x i64> @ugt_58_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_58_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34909,7 +34909,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_59_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34919,7 +34919,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34927,7 +34927,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; 
AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34945,7 +34945,7 @@ define <2 x i64> @ult_59_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35066,7 +35066,7 @@ define <2 x i64> @ugt_59_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_59_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35225,7 +35225,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_60_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35235,7 +35235,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35243,7 +35243,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35261,7 +35261,7 @@ define <2 x i64> @ult_60_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35382,7 +35382,7 @@ define <2 x i64> @ugt_60_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_60_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35541,7 +35541,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_61_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35551,7 +35551,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35559,7 +35559,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35577,7 +35577,7 @@ define <2 x i64> @ult_61_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35698,7 +35698,7 @@ define <2 x i64> @ugt_61_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_61_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35857,7 +35857,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_62_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35867,7 +35867,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35875,7 +35875,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35893,7 +35893,7 @@ define <2 x i64> @ult_62_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq 
@@ -36014,7 +36014,7 @@ define <2 x i64> @ugt_62_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ugt_62_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36173,7 +36173,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; ; AVX2-LABEL: ult_63_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36183,7 +36183,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -36191,7 +36191,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -36209,7 +36209,7 @@ define <2 x i64> @ult_63_v2i64(<2 x i64> %0) { ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll index a03e34dc46b32..21792140625cd 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VPOPCNTDQ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VPOPCNTDQVL @@ -84,19 +84,33 @@ define <2 x i64> @testv2i64(<2 x 
i64> %in) nounwind { ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: testv2i64: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: testv2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: testv2i64: ; XOP: # %bb.0: @@ -235,23 +249,41 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: testv4i32: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: testv4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: testv4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: testv4i32: ; XOP: # %bb.0: @@ -390,20 +422,35 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: testv8i16: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1OR2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: testv8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: testv8i16: ; XOP: # %bb.0: @@ -518,17 +565,29 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: testv16i8: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: testv16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: testv16i8: ; XOP: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll index c5bb1dfe6001b..fad3effc66f9f 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -119,14 +119,14 @@ define <32 x i8> @ult_2_v32i8(<32 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_2_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -161,9 +161,10 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ugt_2_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -174,9 +175,10 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -187,9 +189,10 @@ define <32 x i8> @ugt_2_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_2_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -242,43 +245,46 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ult_3_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_3_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; 
AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ult_3_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -286,14 +292,14 @@ define <32 x i8> @ult_3_v32i8(<32 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_3_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -328,9 +334,10 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ugt_3_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -341,9 +348,10 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -354,9 +362,10 @@ define <32 x i8> @ugt_3_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_3_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -409,43 +418,46 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ult_4_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_4_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: 
vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ult_4_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -453,14 +465,14 @@ define <32 x i8> @ult_4_v32i8(<32 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_4_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -495,9 +507,10 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ugt_4_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -508,9 +521,10 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -521,9 +535,10 @@ define <32 x i8> @ugt_4_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_4_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -576,43 +591,46 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ult_5_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_5_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: 
vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ult_5_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -620,14 +638,14 @@ define <32 x i8> @ult_5_v32i8(<32 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_5_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -662,9 +680,10 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ugt_5_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: 
vpand %ymm1, %ymm0, %ymm0 @@ -675,9 +694,10 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -688,9 +708,10 @@ define <32 x i8> @ugt_5_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_5_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -743,43 +764,46 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ult_6_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_6_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ult_6_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -787,14 +811,14 @@ define <32 x i8> @ult_6_v32i8(<32 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_6_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -829,9 +853,10 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ugt_6_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -842,9 +867,10 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -855,9 +881,10 @@ define <32 x i8> @ugt_6_v32i8(<32 x i8> %0) { ; ; AVX512VPOPCNTDQVL-LABEL: ugt_6_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -910,43 +937,46 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) { ; ; AVX2-LABEL: ult_7_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_7_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 
+; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: ult_7_v32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -954,14 +984,14 @@ define <32 x i8> @ult_7_v32i8(<32 x i8> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_7_v32i8: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %0) @@ -1082,14 +1112,14 @@ define <16 x i16> @ult_2_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = 
[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_2_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1130,9 +1160,10 @@ define <16 x i16> @ugt_2_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_2_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1210,9 +1241,10 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_3_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1221,7 +1253,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1230,7 +1262,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1239,7 +1271,7 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1247,14 +1279,14 @@ define <16 x i16> @ult_3_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_3_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1295,9 +1327,10 @@ define <16 x i16> @ugt_3_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_3_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1375,9 +1408,10 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_4_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1386,7 +1420,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1395,7 +1429,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1404,7 +1438,7 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1412,14 +1446,14 @@ define <16 x i16> @ult_4_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_4_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1460,9 +1494,10 @@ define <16 x i16> @ugt_4_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_4_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1540,9 +1575,10 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_5_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; 
AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1551,7 +1587,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1560,7 +1596,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1569,7 +1605,7 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1577,14 +1613,14 @@ define <16 x i16> @ult_5_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_5_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1625,9 +1661,10 @@ define <16 x i16> @ugt_5_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_5_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1705,9 +1742,10 @@ define <16 x i16> 
@ult_6_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_6_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1716,7 +1754,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1725,7 +1763,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1734,7 +1772,7 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1742,14 +1780,14 @@ define <16 x i16> @ult_6_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_6_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1790,9 +1828,10 @@ define <16 x i16> @ugt_6_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_6_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1870,9 +1909,10 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_7_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1881,7 +1921,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1890,7 +1930,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1899,7 +1939,7 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -1907,14 +1947,14 @@ define <16 x i16> @ult_7_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_7_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -1955,9 +1995,10 @@ define <16 x i16> @ugt_7_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_7_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2035,9 +2076,10 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_8_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2046,7 +2088,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2055,7 +2097,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2064,7 +2106,7 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; 
AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2072,14 +2114,14 @@ define <16 x i16> @ult_8_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_8_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2120,9 +2162,10 @@ define <16 x i16> @ugt_8_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_8_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2200,9 +2243,10 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_9_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2211,7 +2255,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2220,7 +2264,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; 
AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2229,7 +2273,7 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2237,14 +2281,14 @@ define <16 x i16> @ult_9_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_9_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2285,9 +2329,10 @@ define <16 x i16> @ugt_9_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_9_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2365,9 +2410,10 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_10_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2376,7 +2422,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2385,7 +2431,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2394,7 +2440,7 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2402,14 +2448,14 @@ define <16 x i16> @ult_10_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_10_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2450,9 +2496,10 @@ define <16 x i16> @ugt_10_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_10_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2530,9 +2577,10 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_11_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2541,7 +2589,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2550,7 +2598,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2559,7 +2607,7 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2567,14 +2615,14 @@ define <16 x i16> @ult_11_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_11_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2615,9 +2663,10 @@ define <16 x i16> @ugt_11_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_11_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2695,9 +2744,10 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_12_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2706,7 +2756,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2715,7 +2765,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2724,7 +2774,7 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2732,14 +2782,14 @@ define <16 x i16> @ult_12_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: 
vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_12_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2780,9 +2830,10 @@ define <16 x i16> @ugt_12_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_12_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2860,9 +2911,10 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_13_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -2871,7 +2923,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2880,7 +2932,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2889,7 +2941,7 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -2897,14 +2949,14 @@ define <16 x i16> @ult_13_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_13_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -2945,9 +2997,10 @@ define <16 x i16> @ugt_13_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_13_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3025,9 +3078,10 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_14_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3036,7 +3090,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3045,7 +3099,7 @@ define <16 x 
i16> @ult_14_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -3054,7 +3108,7 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -3062,14 +3116,14 @@ define <16 x i16> @ult_14_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_14_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -3110,9 +3164,10 @@ define <16 x i16> @ugt_14_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ugt_14_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3190,9 +3245,10 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; ; AVX2-LABEL: ult_15_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} 
ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3201,7 +3257,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3210,7 +3266,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -3219,7 +3275,7 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; @@ -3227,14 +3283,14 @@ define <16 x i16> @ult_15_v16i16(<16 x i16> %0) { ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: ult_15_v16i16: ; BITALG: # %bb.0: ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; BITALG-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; BITALG-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG-NEXT: retq %2 = tail call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %0) @@ -3410,9 +3466,10 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_2_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3512,9 +3569,10 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_3_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3614,9 +3672,10 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_3_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3716,9 +3775,10 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_4_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3818,9 +3878,10 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_4_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -3920,9 +3981,10 @@ 
define <8 x i32> @ult_5_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_5_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4022,9 +4084,10 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_5_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4124,9 +4187,10 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_6_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4226,9 +4290,10 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_6_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4328,9 +4393,10 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_7_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4430,9 +4496,10 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_7_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4532,9 +4599,10 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_8_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4634,9 +4702,10 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_8_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4736,9 +4805,10 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_9_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; 
AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4838,9 +4908,10 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_9_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -4940,9 +5011,10 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_10_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5042,9 +5114,10 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_10_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5144,9 +5217,10 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_11_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5246,9 +5320,10 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_11_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5348,9 +5423,10 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_12_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5450,9 +5526,10 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_12_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5552,9 +5629,10 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_13_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5654,9 +5732,10 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_13_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5756,9 +5835,10 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_14_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5858,9 +5938,10 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_14_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -5960,9 +6041,10 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_15_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6062,9 +6144,10 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_15_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6164,9 +6247,10 
@@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_16_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6266,9 +6350,10 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_16_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6368,9 +6453,10 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_17_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6470,9 +6556,10 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_17_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6572,9 +6659,10 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_18_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6674,9 +6762,10 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_18_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6776,9 +6865,10 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_19_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6878,9 +6968,10 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_19_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -6980,9 +7071,10 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_20_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7082,9 +7174,10 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_20_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7184,9 +7277,10 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_21_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7286,9 +7380,10 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_21_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7388,9 +7483,10 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_22_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7490,9 +7586,10 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_22_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7592,9 +7689,10 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_23_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7694,9 +7792,10 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_23_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7796,9 +7895,10 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_24_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -7898,9 +7998,10 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_24_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8000,9 +8101,10 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_25_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8102,9 +8204,10 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_25_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8204,9 +8307,10 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_26_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8306,9 +8410,10 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_26_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8408,9 +8513,10 
@@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_27_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8510,9 +8616,10 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_27_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8612,9 +8719,10 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_28_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8714,9 +8822,10 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_28_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8816,9 +8925,10 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_29_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -8918,9 +9028,10 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_29_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9020,9 +9131,10 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_30_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9122,9 +9234,10 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ugt_30_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9224,9 +9337,10 @@ define <8 x i32> @ult_31_v8i32(<8 x i32> %0) { ; ; AVX2-LABEL: ult_31_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9448,9 +9562,10 @@ define <4 x i64> @ugt_2_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_2_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9530,9 +9645,10 @@ define <4 x i64> @ult_3_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_3_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9612,9 +9728,10 @@ define <4 x i64> @ugt_3_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_3_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9694,9 +9811,10 @@ define <4 x i64> @ult_4_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_4_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9776,9 +9894,10 @@ define <4 x i64> @ugt_4_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_4_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9858,9 +9977,10 @@ define <4 x i64> @ult_5_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_5_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -9940,9 +10060,10 @@ define <4 x i64> @ugt_5_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_5_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10022,9 +10143,10 @@ define <4 x i64> @ult_6_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_6_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10104,9 +10226,10 @@ define <4 x i64> @ugt_6_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_6_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10186,9 +10309,10 @@ define <4 x i64> @ult_7_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_7_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10268,9 +10392,10 @@ define <4 x i64> @ugt_7_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_7_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10350,9 +10475,10 @@ define <4 x i64> @ult_8_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_8_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10432,9 +10558,10 @@ define <4 x i64> @ugt_8_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_8_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10514,9 
+10641,10 @@ define <4 x i64> @ult_9_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_9_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10596,9 +10724,10 @@ define <4 x i64> @ugt_9_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_9_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10678,9 +10807,10 @@ define <4 x i64> @ult_10_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_10_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10760,9 +10890,10 @@ define <4 x i64> @ugt_10_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_10_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10842,9 +10973,10 @@ define <4 x i64> @ult_11_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_11_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -10924,9 +11056,10 @@ define <4 x i64> @ugt_11_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_11_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11006,9 +11139,10 @@ define <4 x i64> @ult_12_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_12_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11088,9 +11222,10 @@ define <4 x i64> @ugt_12_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_12_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11170,9 +11305,10 @@ define <4 x i64> @ult_13_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_13_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11252,9 +11388,10 @@ define <4 x i64> @ugt_13_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_13_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11334,9 +11471,10 @@ define <4 x i64> @ult_14_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_14_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11416,9 +11554,10 @@ define <4 x i64> @ugt_14_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_14_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11498,9 +11637,10 @@ define <4 x i64> @ult_15_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_15_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11580,9 +11720,10 @@ define <4 x i64> @ugt_15_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_15_v4i64: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11662,9 +11803,10 @@ define <4 x i64> @ult_16_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_16_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11744,9 +11886,10 @@ define <4 x i64> @ugt_16_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_16_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11826,9 +11969,10 @@ define <4 x i64> @ult_17_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_17_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11908,9 +12052,10 @@ define <4 x i64> @ugt_17_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_17_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -11990,9 +12135,10 @@ define <4 x i64> @ult_18_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_18_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12072,9 +12218,10 @@ define <4 x i64> @ugt_18_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_18_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12154,9 +12301,10 @@ define <4 x i64> @ult_19_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_19_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12236,9 +12384,10 @@ define <4 x i64> @ugt_19_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_19_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, 
%ymm0 @@ -12318,9 +12467,10 @@ define <4 x i64> @ult_20_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_20_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12400,9 +12550,10 @@ define <4 x i64> @ugt_20_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_20_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12482,9 +12633,10 @@ define <4 x i64> @ult_21_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_21_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12564,9 +12716,10 @@ define <4 x i64> @ugt_21_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_21_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12646,9 +12799,10 @@ define <4 x i64> @ult_22_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_22_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12728,9 +12882,10 @@ define <4 x i64> @ugt_22_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_22_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12810,9 +12965,10 @@ define <4 x i64> @ult_23_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_23_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12892,9 +13048,10 @@ define <4 x i64> @ugt_23_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_23_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -12974,9 +13131,10 @@ define <4 x i64> @ult_24_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_24_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13056,9 +13214,10 @@ define <4 x i64> @ugt_24_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_24_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13138,9 +13297,10 @@ define <4 x i64> @ult_25_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_25_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13220,9 +13380,10 @@ define <4 x i64> @ugt_25_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_25_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13302,9 +13463,10 @@ define <4 x i64> @ult_26_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_26_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13384,9 +13546,10 @@ define <4 x i64> @ugt_26_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_26_v4i64: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13466,9 +13629,10 @@ define <4 x i64> @ult_27_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_27_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13548,9 +13712,10 @@ define <4 x i64> @ugt_27_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_27_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13630,9 +13795,10 @@ define <4 x i64> @ult_28_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_28_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13712,9 +13878,10 @@ define <4 x i64> @ugt_28_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_28_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13794,9 +13961,10 @@ define <4 x i64> @ult_29_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_29_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13876,9 +14044,10 @@ define <4 x i64> @ugt_29_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_29_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -13958,9 +14127,10 @@ define <4 x i64> @ult_30_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_30_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14040,9 +14210,10 @@ define <4 x i64> @ugt_30_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_30_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, 
%ymm0 @@ -14122,9 +14293,10 @@ define <4 x i64> @ult_31_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_31_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14204,9 +14376,10 @@ define <4 x i64> @ugt_31_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_31_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14286,9 +14459,10 @@ define <4 x i64> @ult_32_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_32_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14368,9 +14542,10 @@ define <4 x i64> @ugt_32_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_32_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14450,9 +14625,10 @@ define <4 x i64> @ult_33_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_33_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14532,9 +14708,10 @@ define <4 x i64> @ugt_33_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_33_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14614,9 +14791,10 @@ define <4 x i64> @ult_34_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_34_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14696,9 +14874,10 @@ define <4 x i64> @ugt_34_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_34_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14778,9 +14957,10 @@ define <4 x i64> @ult_35_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_35_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14860,9 +15040,10 @@ define <4 x i64> @ugt_35_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_35_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -14942,9 +15123,10 @@ define <4 x i64> @ult_36_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_36_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15024,9 +15206,10 @@ define <4 x i64> @ugt_36_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_36_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15106,9 +15289,10 @@ define <4 x i64> @ult_37_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_37_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15188,9 +15372,10 @@ define <4 x i64> @ugt_37_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_37_v4i64: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15270,9 +15455,10 @@ define <4 x i64> @ult_38_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_38_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15352,9 +15538,10 @@ define <4 x i64> @ugt_38_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_38_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15434,9 +15621,10 @@ define <4 x i64> @ult_39_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_39_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15516,9 +15704,10 @@ define <4 x i64> @ugt_39_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_39_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15598,9 +15787,10 @@ define <4 x i64> @ult_40_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_40_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15680,9 +15870,10 @@ define <4 x i64> @ugt_40_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_40_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15762,9 +15953,10 @@ define <4 x i64> @ult_41_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_41_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -15844,9 +16036,10 @@ define <4 x i64> @ugt_41_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_41_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, 
%ymm0 @@ -15926,9 +16119,10 @@ define <4 x i64> @ult_42_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_42_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16008,9 +16202,10 @@ define <4 x i64> @ugt_42_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_42_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16090,9 +16285,10 @@ define <4 x i64> @ult_43_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_43_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16172,9 +16368,10 @@ define <4 x i64> @ugt_43_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_43_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16254,9 +16451,10 @@ define <4 x i64> @ult_44_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_44_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16336,9 +16534,10 @@ define <4 x i64> @ugt_44_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_44_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16418,9 +16617,10 @@ define <4 x i64> @ult_45_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_45_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16500,9 +16700,10 @@ define <4 x i64> @ugt_45_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_45_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16582,9 +16783,10 @@ define <4 x i64> @ult_46_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_46_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16664,9 +16866,10 @@ define <4 x i64> @ugt_46_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_46_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16746,9 +16949,10 @@ define <4 x i64> @ult_47_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_47_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16828,9 +17032,10 @@ define <4 x i64> @ugt_47_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_47_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16910,9 +17115,10 @@ define <4 x i64> @ult_48_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_48_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -16992,9 +17198,10 @@ define <4 x i64> @ugt_48_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_48_v4i64: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17074,9 +17281,10 @@ define <4 x i64> @ult_49_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_49_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17156,9 +17364,10 @@ define <4 x i64> @ugt_49_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_49_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17238,9 +17447,10 @@ define <4 x i64> @ult_50_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_50_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17320,9 +17530,10 @@ define <4 x i64> @ugt_50_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_50_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17402,9 +17613,10 @@ define <4 x i64> @ult_51_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_51_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17484,9 +17696,10 @@ define <4 x i64> @ugt_51_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_51_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17566,9 +17779,10 @@ define <4 x i64> @ult_52_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_52_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17648,9 +17862,10 @@ define <4 x i64> @ugt_52_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_52_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, 
%ymm0 @@ -17730,9 +17945,10 @@ define <4 x i64> @ult_53_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_53_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17812,9 +18028,10 @@ define <4 x i64> @ugt_53_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_53_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17894,9 +18111,10 @@ define <4 x i64> @ult_54_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_54_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -17976,9 +18194,10 @@ define <4 x i64> @ugt_54_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_54_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18058,9 +18277,10 @@ define <4 x i64> @ult_55_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_55_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18140,9 +18360,10 @@ define <4 x i64> @ugt_55_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_55_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18222,9 +18443,10 @@ define <4 x i64> @ult_56_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_56_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18304,9 +18526,10 @@ define <4 x i64> @ugt_56_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_56_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18386,9 +18609,10 @@ define <4 x i64> @ult_57_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_57_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18468,9 +18692,10 @@ define <4 x i64> @ugt_57_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_57_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18550,9 +18775,10 @@ define <4 x i64> @ult_58_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_58_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18632,9 +18858,10 @@ define <4 x i64> @ugt_58_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_58_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18714,9 +18941,10 @@ define <4 x i64> @ult_59_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_59_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18796,9 +19024,10 @@ define <4 x i64> @ugt_59_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_59_v4i64: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18878,9 +19107,10 @@ define <4 x i64> @ult_60_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_60_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -18960,9 +19190,10 @@ define <4 x i64> @ugt_60_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_60_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19042,9 +19273,10 @@ define <4 x i64> @ult_61_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_61_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19124,9 +19356,10 @@ define <4 x i64> @ugt_61_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_61_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19206,9 +19439,10 @@ define <4 x i64> @ult_62_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_62_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19288,9 +19522,10 @@ define <4 x i64> @ugt_62_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ugt_62_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -19370,9 +19605,10 @@ define <4 x i64> @ult_63_v4i64(<4 x i64> %0) { ; ; AVX2-LABEL: ult_63_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll index c35acdcf857d7..568c121409166 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -33,9 +33,10 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; AVX2-LABEL: testv4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: 
vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -131,9 +132,10 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX2-LABEL: testv8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -244,9 +246,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX2-LABEL: testv16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -334,9 +337,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX2-LABEL: testv32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -365,9 +369,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; 
AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -377,9 +382,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll index ef17822b1cf9b..182415f0ae5e2 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll @@ -125,9 +125,10 @@ define <64 x i8> @ult_2_v64i8(<64 x i8> %0) { define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_2_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -140,7 +141,7 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -148,9 +149,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_2_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -162,9 +164,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_2_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -177,7 +180,7 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -185,9 +188,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -212,9 +216,10 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_3_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -227,7 +232,7 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -235,9 +240,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_3_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -249,9 +255,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_3_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -264,7 +271,7 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb 
%ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -272,9 +279,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -299,9 +307,10 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_3_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -314,7 +323,7 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -322,9 +331,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_3_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -336,9 +346,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_3_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -351,7 +362,7 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -359,9 +370,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; 
AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -386,9 +398,10 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_4_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -401,7 +414,7 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -409,9 +422,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_4_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -423,9 +437,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_4_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -438,7 +453,7 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -446,9 +461,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -473,9 +489,10 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_4_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -488,7 +505,7 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = 
[4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -496,9 +513,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_4_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -510,9 +528,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_4_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -525,7 +544,7 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -533,9 +552,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -560,9 +580,10 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_5_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -575,7 +596,7 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -583,9 +604,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_5_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -597,9 +619,10 @@ 
define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_5_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -612,7 +635,7 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -620,9 +643,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -647,9 +671,10 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_5_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} 
ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -662,7 +687,7 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -670,9 +695,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_5_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -684,9 +710,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_5_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -699,7 +726,7 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 
; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -707,9 +734,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -734,9 +762,10 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_6_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -749,7 +778,7 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -757,9 +786,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_6_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -771,9 +801,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_6_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -786,7 +817,7 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -794,9 +825,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -821,9 +853,10 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; 
AVX512F-LABEL: ugt_6_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -836,7 +869,7 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -844,9 +877,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ugt_6_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -858,9 +892,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_6_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -873,7 +908,7 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; 
AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -881,9 +916,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -908,9 +944,10 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_7_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -923,7 +960,7 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -931,9 +968,10 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; ; AVX512BW-LABEL: ult_7_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -945,9 +983,10 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_7_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -960,7 +999,7 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -968,9 +1007,10 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; 
AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1111,9 +1151,10 @@ define <32 x i16> @ult_2_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_2_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1132,7 +1173,7 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1140,9 +1181,10 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_2_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1164,7 +1206,7 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQ-NOBW-NEXT: 
vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1172,9 +1214,10 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_2_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1202,9 +1245,10 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_3_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1223,7 +1267,7 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1231,9 +1275,10 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_3_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: 
vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1255,7 +1300,7 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1263,9 +1308,10 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_3_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1293,9 +1339,10 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_3_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand 
%ymm1, %ymm4, %ymm4 @@ -1314,7 +1361,7 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1322,9 +1369,10 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_3_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1346,7 +1394,7 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1354,9 +1402,10 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_3_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1384,9 +1433,10 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_4_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1405,7 +1455,7 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1413,9 +1463,10 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_4_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1437,7 +1488,7 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = 
[4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1445,9 +1496,10 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_4_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1475,9 +1527,10 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_4_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1496,7 +1549,7 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1504,9 +1557,10 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_4_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1528,7 +1582,7 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1536,9 +1590,10 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_4_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1566,9 +1621,10 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_5_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1587,7 
+1643,7 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1595,9 +1651,10 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_5_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1619,7 +1676,7 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1627,9 +1684,10 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_5_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1657,9 +1715,10 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_5_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1678,7 +1737,7 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1686,9 +1745,10 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_5_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1710,7 +1770,7 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = 
[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1718,9 +1778,10 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_5_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1748,9 +1809,10 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_6_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1769,7 +1831,7 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1777,9 +1839,10 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_6_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1801,7 +1864,7 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1809,9 +1872,10 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_6_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1839,9 +1903,10 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_6_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1860,7 
+1925,7 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1868,9 +1933,10 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_6_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1892,7 +1958,7 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1900,9 +1966,10 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_6_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1930,9 +1997,10 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_7_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -1951,7 +2019,7 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1959,9 +2027,10 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_7_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -1983,7 +2052,7 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1991,9 +2060,10 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_7_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2021,9 +2091,10 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_7_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2042,7 +2113,7 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2050,9 +2121,10 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_7_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2074,7 +2146,7 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2082,9 +2154,10 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_7_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2112,9 +2185,10 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_8_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2133,7 
+2207,7 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2141,9 +2215,10 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_8_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2165,7 +2240,7 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2173,9 +2248,10 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_8_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2203,9 +2279,10 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_8_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2224,7 +2301,7 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2232,9 +2309,10 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_8_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2256,7 +2334,7 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = 
[8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2264,9 +2342,10 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_8_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2294,9 +2373,10 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_9_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2315,7 +2395,7 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2323,9 +2403,10 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_9_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2347,7 +2428,7 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2355,9 +2436,10 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_9_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2385,9 +2467,10 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_9_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2406,7 
+2489,7 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2414,9 +2497,10 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_9_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2438,7 +2522,7 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2446,9 +2530,10 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_9_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2476,9 +2561,10 @@ define <32 x i16> @ugt_9_v32i16(<32 x i16> %0) { define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_10_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2497,7 +2583,7 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2505,9 +2591,10 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_10_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2529,7 +2616,7 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NOBW-NEXT: 
vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2537,9 +2624,10 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_10_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2567,9 +2655,10 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_10_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2588,7 +2677,7 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2596,9 +2685,10 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_10_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2620,7 +2710,7 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2628,9 +2718,10 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_10_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2658,9 +2749,10 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_11_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; 
AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2679,7 +2771,7 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2687,9 +2779,10 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_11_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2711,7 +2804,7 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2719,9 +2812,10 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_11_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; 
AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2749,9 +2843,10 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_11_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2770,7 +2865,7 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2778,9 +2873,10 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_11_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2802,7 +2898,7 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2810,9 +2906,10 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_11_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2840,9 +2937,10 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_12_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2861,7 +2959,7 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2869,9 +2967,10 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_12_v32i16: ; AVX512BW: # 
%bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2893,7 +2992,7 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2901,9 +3000,10 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_12_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2931,9 +3031,10 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_12_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -2952,7 +3053,7 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2960,9 +3061,10 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_12_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -2984,7 +3086,7 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2992,9 +3094,10 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_12_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3022,9 +3125,10 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { define <32 x i16> @ult_13_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_13_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3043,7 +3147,7 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3051,9 +3155,10 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_13_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3075,7 +3180,7 @@ define <32 
x i16> @ult_13_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3083,9 +3188,10 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_13_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3113,9 +3219,10 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_13_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3134,7 +3241,7 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3142,9 +3249,10 @@ define 
<32 x i16> @ugt_13_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_13_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3166,7 +3274,7 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3174,9 +3282,10 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_13_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3204,9 +3313,10 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) { define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_14_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3225,7 +3335,7 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3233,9 +3343,10 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_14_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3257,7 +3368,7 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3265,9 +3376,10 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_14_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3295,9 +3407,10 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_14_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3316,7 +3429,7 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3324,9 +3437,10 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ugt_14_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3348,7 +3462,7 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3356,9 +3470,10 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ugt_14_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3386,9 +3501,10 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_15_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -3407,7 +3523,7 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3415,9 +3531,10 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; ; AVX512BW-LABEL: ult_15_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3439,7 +3556,7 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3447,9 +3564,10 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; ; AVX512VPOPCNTDQ-BW-LABEL: ult_15_v32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3552,9 
+3670,10 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_2_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -3584,9 +3703,10 @@ define <16 x i32> @ugt_2_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_2_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3631,9 +3751,10 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_3_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -3663,9 +3784,10 @@ define <16 x i32> @ult_3_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_3_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3710,9 +3832,10 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_3_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -3742,9 +3865,10 @@ define <16 x i32> @ugt_3_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_3_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3789,9 +3913,10 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_4_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ 
-3821,9 +3946,10 @@ define <16 x i32> @ult_4_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_4_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3868,9 +3994,10 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_4_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -3900,9 +4027,10 @@ define <16 x i32> @ugt_4_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_4_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -3947,9 +4075,10 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_5_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -3979,9 +4108,10 @@ define <16 x i32> @ult_5_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_5_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4026,9 +4156,10 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_5_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4058,9 +4189,10 @@ define <16 x i32> @ugt_5_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_5_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4105,9 +4237,10 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_6_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4137,9 +4270,10 @@ define <16 x i32> @ult_6_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_6_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4184,9 +4318,10 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_6_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4216,9 +4351,10 @@ define <16 x i32> @ugt_6_v16i32(<16 x i32> %0) { ; ; 
AVX512BW-LABEL: ugt_6_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4263,9 +4399,10 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_7_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4295,9 +4432,10 @@ define <16 x i32> @ult_7_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_7_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4342,9 +4480,10 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_7_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 
= [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4374,9 +4513,10 @@ define <16 x i32> @ugt_7_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_7_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4421,9 +4561,10 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_8_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4453,9 +4594,10 @@ define <16 x i32> @ult_8_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_8_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4500,9 +4642,10 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_8_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4532,9 +4675,10 @@ define <16 x i32> @ugt_8_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_8_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4579,9 +4723,10 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_9_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4611,9 +4756,10 @@ define <16 x i32> @ult_9_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_9_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4658,9 +4804,10 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_9_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4690,9 +4837,10 @@ define <16 x i32> @ugt_9_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_9_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4737,9 +4885,10 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_10_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4769,9 +4918,10 @@ define <16 x i32> @ult_10_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_10_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4816,9 +4966,10 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_10_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4848,9 +4999,10 @@ define <16 x i32> @ugt_10_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_10_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4895,9 +5047,10 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_11_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -4927,9 +5080,10 @@ define <16 x i32> @ult_11_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_11_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -4974,9 +5128,10 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_11_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5006,9 +5161,10 @@ define <16 x i32> @ugt_11_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_11_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5053,9 +5209,10 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_12_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5085,9 +5242,10 @@ define <16 x i32> @ult_12_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_12_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5132,9 +5290,10 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_12_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5164,9 +5323,10 @@ define <16 x i32> @ugt_12_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_12_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5211,9 +5371,10 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_13_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5243,9 +5404,10 @@ define <16 x i32> @ult_13_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_13_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5290,9 +5452,10 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_13_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5322,9 +5485,10 @@ define <16 x i32> @ugt_13_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_13_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5369,9 +5533,10 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_14_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5401,9 +5566,10 @@ define <16 x i32> @ult_14_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_14_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5448,9 +5614,10 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_14_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5480,9 +5647,10 @@ define <16 x i32> @ugt_14_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_14_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5527,9 +5695,10 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_15_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5559,9 +5728,10 @@ define <16 x i32> @ult_15_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_15_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5606,9 +5776,10 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_15_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5638,9 +5809,10 @@ define <16 x i32> @ugt_15_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_15_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5685,9 +5857,10 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_16_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5717,9 +5890,10 @@ define <16 x i32> @ult_16_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_16_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5764,9 +5938,10 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_16_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5796,9 +5971,10 @@ define <16 x i32> @ugt_16_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_16_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5843,9 +6019,10 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_17_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5875,9 +6052,10 @@ define <16 x i32> @ult_17_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_17_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -5922,9 +6100,10 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_17_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -5954,9 +6133,10 @@ define <16 x i32> @ugt_17_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_17_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6001,9 +6181,10 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_18_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6033,9 +6214,10 @@ define <16 x i32> @ult_18_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_18_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6080,9 +6262,10 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_18_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6112,9 +6295,10 @@ define <16 x i32> @ugt_18_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_18_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6159,9 +6343,10 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_19_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6191,9 +6376,10 @@ define <16 x i32> @ult_19_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_19_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6238,9 +6424,10 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_19_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6270,9 +6457,10 @@ define <16 x i32> @ugt_19_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_19_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6317,9 +6505,10 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_20_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6349,9 +6538,10 @@ define <16 x i32> @ult_20_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_20_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6396,9 +6586,10 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_20_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6428,9 +6619,10 @@ define <16 x i32> @ugt_20_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_20_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6475,9 +6667,10 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_21_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6507,9 +6700,10 @@ define <16 x i32> @ult_21_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_21_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6554,9 +6748,10 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_21_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6586,9 +6781,10 @@ define <16 x i32> @ugt_21_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_21_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6633,9 +6829,10 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_22_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6665,9 +6862,10 @@ define <16 x i32> @ult_22_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_22_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6712,9 +6910,10 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_22_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6744,9 +6943,10 @@ define <16 x i32> @ugt_22_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_22_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6791,9 +6991,10 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_23_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6823,9 +7024,10 @@ define <16 x i32> @ult_23_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_23_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6870,9 +7072,10 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_23_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6902,9 +7105,10 @@ define <16 x i32> @ugt_23_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_23_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -6949,9 +7153,10 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_24_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -6981,9 +7186,10 @@ define <16 x i32> @ult_24_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_24_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7028,9 +7234,10 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_24_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7060,9 +7267,10 @@ define <16 x i32> @ugt_24_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_24_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7107,9 +7315,10 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_25_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7139,9 +7348,10 @@ define <16 x i32> @ult_25_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_25_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7186,9 +7396,10 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_25_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7218,9 +7429,10 @@ define <16 x i32> @ugt_25_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_25_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7265,9 +7477,10 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_26_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7297,9 +7510,10 @@ define <16 x i32> @ult_26_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_26_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7344,9 +7558,10 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_26_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7376,9 +7591,10 @@ define <16 x i32> @ugt_26_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_26_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7423,9 +7639,10 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_27_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7455,9 +7672,10 @@ define <16 x i32> @ult_27_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_27_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7502,9 +7720,10 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_27_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7534,9 +7753,10 @@ define <16 x i32> @ugt_27_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_27_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7581,9 +7801,10 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_28_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7613,9 +7834,10 @@ define <16 x i32> @ult_28_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_28_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7660,9 +7882,10 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_28_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7692,9 +7915,10 @@ define <16 x i32> @ugt_28_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_28_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7739,9 +7963,10 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_29_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7771,9 +7996,10 @@ define <16 x i32> @ult_29_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_29_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7818,9 +8044,10 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_29_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7850,9 +8077,10 @@ define <16 x i32> @ugt_29_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_29_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7897,9 +8125,10 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_30_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -7929,9 +8158,10 @@ define <16 x i32> @ult_30_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_30_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -7976,9 +8206,10 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ugt_30_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8008,9 +8239,10 @@ define <16 x i32> @ugt_30_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ugt_30_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8055,9 +8287,10 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) { ; AVX512F-LABEL: ult_31_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8087,9 +8320,10 @@ define <16 x i32> @ult_31_v16i32(<16 x i32> %0) { ; ; AVX512BW-LABEL: ult_31_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8208,9 +8442,10 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_2_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8232,9 +8467,10 @@ define <8 x i64> @ugt_2_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_2_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8271,9 +8507,10 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_3_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8295,9 +8532,10 @@ define <8 x i64> @ult_3_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_3_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8334,9 +8572,10 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_3_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8358,9 +8597,10 @@ define <8 x i64> @ugt_3_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_3_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8397,9 +8637,10 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_4_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8421,9 +8662,10 @@ define <8 x i64> @ult_4_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_4_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8460,9 +8702,10 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_4_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8484,9 +8727,10 @@ define <8 x i64> @ugt_4_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_4_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8523,9 +8767,10 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_5_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8547,9 +8792,10 @@ define <8 x i64> @ult_5_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_5_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8586,9 +8832,10 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_5_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8610,9 +8857,10 @@ define <8 x i64> @ugt_5_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_5_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8649,9 +8897,10 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_6_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8673,9 +8922,10 @@ define <8 x i64> @ult_6_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_6_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8712,9 +8962,10 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_6_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8736,9 +8987,10 @@ define <8 x i64> @ugt_6_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_6_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8775,9 +9027,10 @@ define <8 x i64> @ult_7_v8i64(<8 x 
i64> %0) { ; AVX512F-LABEL: ult_7_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8799,9 +9052,10 @@ define <8 x i64> @ult_7_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_7_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8838,9 +9092,10 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_7_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8862,9 +9117,10 @@ define <8 x i64> @ugt_7_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_7_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8901,9 +9157,10 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_8_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8925,9 +9182,10 @@ define <8 x i64> @ult_8_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_8_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -8964,9 +9222,10 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_8_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -8988,9 +9247,10 @@ define <8 x i64> @ugt_8_v8i64(<8 x i64> %0) { ; ; 
AVX512BW-LABEL: ugt_8_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9027,9 +9287,10 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_9_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9051,9 +9312,10 @@ define <8 x i64> @ult_9_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_9_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9090,9 +9352,10 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_9_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9114,9 +9377,10 @@ define <8 x i64> @ugt_9_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_9_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9153,9 +9417,10 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_10_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9177,9 +9442,10 @@ define <8 x i64> @ult_10_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_10_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9216,9 +9482,10 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_10_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9240,9 +9507,10 @@ define <8 x i64> @ugt_10_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_10_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9279,9 +9547,10 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_11_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9303,9 +9572,10 @@ define <8 x i64> @ult_11_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_11_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9342,9 +9612,10 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_11_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9366,9 +9637,10 @@ define <8 x i64> @ugt_11_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_11_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9405,9 +9677,10 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_12_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9429,9 +9702,10 @@ define <8 x i64> @ult_12_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_12_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9468,9 +9742,10 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_12_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9492,9 +9767,10 @@ define <8 x i64> @ugt_12_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_12_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9531,9 +9807,10 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_13_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9555,9 +9832,10 @@ define <8 x i64> @ult_13_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_13_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9594,9 +9872,10 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_13_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9618,9 +9897,10 @@ define <8 x i64> @ugt_13_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_13_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9657,9 +9937,10 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_14_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9681,9 +9962,10 @@ define <8 x i64> @ult_14_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_14_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9720,9 +10002,10 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_14_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9744,9 +10027,10 @@ define <8 x i64> @ugt_14_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_14_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9783,9 +10067,10 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_15_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9807,9 +10092,10 @@ define <8 x i64> @ult_15_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_15_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9846,9 +10132,10 @@ define <8 x i64> 
@ugt_15_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_15_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9870,9 +10157,10 @@ define <8 x i64> @ugt_15_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_15_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9909,9 +10197,10 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_16_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9933,9 +10222,10 @@ define <8 x i64> @ult_16_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_16_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, 
%zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -9972,9 +10262,10 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_16_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -9996,9 +10287,10 @@ define <8 x i64> @ugt_16_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_16_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10035,9 +10327,10 @@ define <8 x i64> @ult_17_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_17_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10059,9 +10352,10 @@ define <8 
x i64> @ult_17_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_17_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10098,9 +10392,10 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_17_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10122,9 +10417,10 @@ define <8 x i64> @ugt_17_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_17_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10161,9 +10457,10 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_18_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10185,9 +10482,10 @@ define <8 x i64> @ult_18_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_18_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10224,9 +10522,10 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_18_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10248,9 +10547,10 @@ define <8 x i64> @ugt_18_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_18_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10287,9 +10587,10 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_19_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10311,9 +10612,10 @@ define <8 x i64> @ult_19_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_19_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10350,9 +10652,10 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_19_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10374,9 +10677,10 @@ define <8 x i64> @ugt_19_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_19_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10413,9 +10717,10 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_20_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10437,9 +10742,10 @@ define <8 x i64> @ult_20_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_20_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10476,9 +10782,10 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_20_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10500,9 +10807,10 @@ define <8 x i64> @ugt_20_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_20_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10539,9 +10847,10 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_21_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10563,9 +10872,10 @@ define <8 x i64> @ult_21_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_21_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10602,9 +10912,10 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_21_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10626,9 +10937,10 @@ define <8 x i64> @ugt_21_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_21_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10665,9 +10977,10 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_22_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10689,9 +11002,10 @@ define <8 x i64> @ult_22_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_22_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10728,9 +11042,10 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_22_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10752,9 +11067,10 @@ define <8 x i64> @ugt_22_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_22_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10791,9 +11107,10 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_23_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10815,9 +11132,10 @@ define <8 x i64> @ult_23_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_23_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10854,9 +11172,10 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_23_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10878,9 +11197,10 @@ define <8 x i64> @ugt_23_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_23_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10917,9 +11237,10 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_24_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -10941,9 +11262,10 @@ define <8 x i64> @ult_24_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_24_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -10980,9 +11302,10 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_24_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11004,9 +11327,10 @@ define <8 x i64> @ugt_24_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_24_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11043,9 +11367,10 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_25_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11067,9 +11392,10 @@ define <8 x i64> @ult_25_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_25_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11106,9 +11432,10 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_25_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11130,9 +11457,10 @@ define <8 x i64> @ugt_25_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_25_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11169,9 +11497,10 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_26_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11193,9 +11522,10 @@ define <8 x i64> @ult_26_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_26_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11232,9 +11562,10 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_26_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11256,9 +11587,10 @@ define <8 x i64> @ugt_26_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_26_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11295,9 +11627,10 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_27_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11319,9 +11652,10 @@ define <8 x i64> @ult_27_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_27_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11358,9 +11692,10 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_27_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11382,9 +11717,10 @@ define <8 x i64> @ugt_27_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_27_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11421,9 +11757,10 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_28_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11445,9 +11782,10 @@ define <8 x i64> @ult_28_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_28_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11484,9 +11822,10 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_28_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11508,9 +11847,10 @@ define <8 x i64> @ugt_28_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_28_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11547,9 +11887,10 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_29_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11571,9 +11912,10 @@ define <8 x i64> @ult_29_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_29_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11610,9 +11952,10 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_29_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11634,9 +11977,10 @@ define <8 x i64> @ugt_29_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_29_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11673,9 +12017,10 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_30_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11697,9 +12042,10 @@ define <8 x i64> @ult_30_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_30_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11736,9 +12082,10 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_30_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11760,9 +12107,10 @@ define <8 x i64> @ugt_30_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_30_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11799,9 +12147,10 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_31_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11823,9 +12172,10 @@ define <8 x i64> @ult_31_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_31_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11862,9 +12212,10 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_31_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11886,9 +12237,10 @@ define <8 x i64> @ugt_31_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_31_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11925,9 +12277,10 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_32_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -11949,9 +12302,10 @@ define <8 x i64> @ult_32_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_32_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -11988,9 +12342,10 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_32_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12012,9 +12367,10 @@ define <8 x i64> @ugt_32_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_32_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12051,9 +12407,10 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_33_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12075,9 +12432,10 @@ define <8 x i64> @ult_33_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_33_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12114,9 +12472,10 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_33_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12138,9 +12497,10 @@ define <8 x i64> @ugt_33_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_33_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12177,9 +12537,10 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_34_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12201,9 +12562,10 @@ define <8 x i64> @ult_34_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_34_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12240,9 +12602,10 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_34_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12264,9 +12627,10 @@ define <8 x i64> @ugt_34_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_34_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12303,9 +12667,10 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_35_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12327,9 +12692,10 @@ define <8 x i64> @ult_35_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_35_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12366,9 +12732,10 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_35_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12390,9 +12757,10 @@ define <8 x i64> @ugt_35_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_35_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12429,9 +12797,10 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_36_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12453,9 +12822,10 @@ define <8 x i64> @ult_36_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_36_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12492,9 +12862,10 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_36_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12516,9 +12887,10 @@ define <8 x i64> @ugt_36_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_36_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12555,9 +12927,10 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_37_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12579,9 +12952,10 @@ define <8 x i64> @ult_37_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_37_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12618,9 +12992,10 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_37_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12642,9 +13017,10 @@ define <8 x i64> @ugt_37_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_37_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12681,9 +13057,10 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_38_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12705,9 +13082,10 @@ define <8 x i64> @ult_38_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_38_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12744,9 +13122,10 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_38_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12768,9 +13147,10 @@ define <8 x i64> @ugt_38_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_38_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12807,9 +13187,10 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_39_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12831,9 +13212,10 @@ define <8 x i64> @ult_39_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_39_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12870,9 +13252,10 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_39_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12894,9 +13277,10 @@ define <8 x i64> @ugt_39_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_39_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12933,9 +13317,10 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_40_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -12957,9 +13342,10 @@ define <8 x i64> @ult_40_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_40_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -12996,9 +13382,10 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_40_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13020,9 +13407,10 @@ define <8 x i64> @ugt_40_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_40_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13059,9 +13447,10 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_41_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13083,9 +13472,10 @@ define <8 x i64> @ult_41_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_41_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13122,9 +13512,10 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_41_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13146,9 +13537,10 @@ define <8 x i64> @ugt_41_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_41_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13185,9 +13577,10 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_42_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13209,9 +13602,10 @@ define <8 x i64> @ult_42_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_42_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13248,9 +13642,10 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_42_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13272,9 +13667,10 @@ define <8 x i64> @ugt_42_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_42_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13311,9 +13707,10 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_43_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13335,9 +13732,10 @@ define <8 x i64> @ult_43_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_43_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13374,9 +13772,10 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_43_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13398,9 +13797,10 @@ define <8 x i64> @ugt_43_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_43_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13437,9 +13837,10 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_44_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13461,9 +13862,10 @@ define <8 x i64> @ult_44_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_44_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13500,9 +13902,10 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_44_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13524,9 +13927,10 @@ define <8 x i64> @ugt_44_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_44_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13563,9 +13967,10 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_45_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13587,9 +13992,10 @@ define <8 x i64> @ult_45_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_45_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13626,9 +14032,10 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_45_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13650,9 +14057,10 @@ define <8 x i64> @ugt_45_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_45_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13689,9 +14097,10 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_46_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13713,9 +14122,10 @@ define <8 x i64> @ult_46_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_46_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13752,9 +14162,10 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_46_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13776,9 +14187,10 @@ define <8 x i64> @ugt_46_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_46_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13815,9 +14227,10 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_47_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13839,9 +14252,10 @@ define <8 x i64> @ult_47_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_47_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13878,9 +14292,10 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_47_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13902,9 +14317,10 @@ define <8 x i64> @ugt_47_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_47_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -13941,9 +14357,10 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_48_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -13965,9 +14382,10 @@ define <8 x i64> @ult_48_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_48_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14004,9 +14422,10 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_48_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14028,9 +14447,10 @@ define <8 x i64> @ugt_48_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_48_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14067,9 +14487,10 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_49_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14091,9 +14512,10 @@ define <8 x i64> @ult_49_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_49_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14130,9 +14552,10 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_49_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14154,9 +14577,10 @@ define <8 x i64> @ugt_49_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_49_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14193,9 +14617,10 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_50_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14217,9 +14642,10 @@ define <8 x i64> @ult_50_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_50_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14256,9 +14682,10 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_50_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14280,9 +14707,10 @@ define <8 x i64> @ugt_50_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_50_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14319,9 +14747,10 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_51_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14343,9 +14772,10 @@ define <8 x i64> @ult_51_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_51_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14382,9 +14812,10 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_51_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14406,9 +14837,10 @@ define <8 x i64> @ugt_51_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_51_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14445,9 +14877,10 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_52_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14469,9 +14902,10 @@ define <8 x i64> @ult_52_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_52_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14508,9 +14942,10 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_52_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14532,9 +14967,10 @@ define <8 x i64> @ugt_52_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_52_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14571,9 +15007,10 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_53_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14595,9 +15032,10 @@ define <8 x i64> @ult_53_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_53_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14634,9 +15072,10 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_53_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14658,9 +15097,10 @@ define <8 x i64> @ugt_53_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_53_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14697,9 +15137,10 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_54_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14721,9 +15162,10 @@ define <8 x i64> @ult_54_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_54_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14760,9 +15202,10 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_54_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14784,9 +15227,10 @@ define <8 x i64> @ugt_54_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_54_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14823,9 +15267,10 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_55_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14847,9 +15292,10 @@ define <8 x i64> @ult_55_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_55_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14886,9 +15332,10 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_55_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14910,9 +15357,10 @@ define <8 x i64> @ugt_55_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_55_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -14949,9 +15397,10 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_56_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -14973,9 +15422,10 @@ define <8 x i64> @ult_56_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_56_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15012,9 +15462,10 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_56_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15036,9 +15487,10 @@ define <8 x i64> @ugt_56_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_56_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15075,9 +15527,10 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_57_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15099,9 +15552,10 @@ define <8 x i64> @ult_57_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_57_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15138,9 +15592,10 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_57_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15162,9 +15617,10 @@ define <8 x i64> @ugt_57_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_57_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15201,9 +15657,10 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_58_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15225,9 +15682,10 @@ define <8 x i64> @ult_58_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_58_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15264,9 +15722,10 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_58_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15288,9 +15747,10 @@ define <8 x i64> @ugt_58_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_58_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15327,9 +15787,10 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_59_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15351,9 +15812,10 @@ define <8 x i64> @ult_59_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_59_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15390,9 +15852,10 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_59_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15414,9 +15877,10 @@ define <8 x i64> @ugt_59_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_59_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15453,9 +15917,10 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_60_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15477,9 +15942,10 @@ define <8 x i64> @ult_60_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_60_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15516,9 +15982,10 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_60_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15540,9 +16007,10 @@ define <8 x i64> @ugt_60_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_60_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15579,9 +16047,10 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_61_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15603,9 +16072,10 @@ define <8 x i64> @ult_61_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_61_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15642,9 +16112,10 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_61_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15666,9 +16137,10 @@ define <8 x i64> @ugt_61_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_61_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15705,9 +16177,10 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_62_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15729,9 +16202,10 @@ define <8 x i64> @ult_62_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_62_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15768,9 +16242,10 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ugt_62_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15792,9 +16267,10 @@ define <8 x i64> @ugt_62_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ugt_62_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -15831,9 +16307,10 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) { ; AVX512F-LABEL: ult_63_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -15855,9 +16332,10 @@ define <8 x i64> @ult_63_v8i64(<8 x i64> %0) { ; ; AVX512BW-LABEL: ult_63_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll index 552a27daf971a..1c1caf8ee4681 100644 --- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll @@ -9,9 +9,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512F-LABEL: testv8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -31,9 +32,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; ; AVX512BW-LABEL: testv8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -62,9 +64,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512F-LABEL: testv16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -92,9 +95,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; ; AVX512BW-LABEL: testv16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -130,9 +134,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512F-LABEL: testv32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 @@ -156,9 +161,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) 
nounwind { ; ; AVX512BW-LABEL: testv32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -183,9 +189,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-BW-LABEL: testv32i16: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -208,9 +215,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512F-LABEL: testv64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -227,9 +235,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; AVX512BW-LABEL: testv64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -240,9 +249,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -259,9 +269,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-BW-LABEL: testv64i8: ; AVX512VPOPCNTDQ-BW: # %bb.0: -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 320b63ee20bd5..03e39e71aaaf1 100644 --- 
a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -914,7 +914,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ; AVX512BW-LABEL: test_v4i16_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -1252,7 +1252,7 @@ define i16 @test_v64i16_v64i8(<64 x i16> %a0) { ; ; AVX2-LABEL: test_v64i16_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll index 8814cd592a8e8..bcb3f4d14b10a 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -439,15 +439,35 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v32i16_v32i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v32i16_v32i1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v32i16_v32i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v32i16_v32i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <32 x i16> %0 to <32 x i1> %b = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> %a) ret i1 %b @@ -490,15 +510,35 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512-LABEL: trunc_v64i8_v64i1: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; AVX512-NEXT: kortestw %k0, %k0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: trunc_v64i8_v64i1: +; 
AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512F-NEXT: kortestw %k0, %k0 +; AVX512F-NEXT: sete %al +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: trunc_v64i8_v64i1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kortestw %k0, %k0 +; AVX512BW-NEXT: sete %al +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: trunc_v64i8_v64i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 +; AVX512VL-NEXT: kortestw %k0, %k0 +; AVX512VL-NEXT: sete %al +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %a = trunc <64 x i8> %0 to <64 x i1> %b = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> %a) ret i1 %b diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index 36d7dba44b94a..92c2ebc83b142 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -63,16 +63,27 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE42-NEXT: movq %xmm2, %rax ; SSE42-NEXT: retq ; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: test_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll index b745c97d5025b..5cade4eaaba63 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -63,16 +63,27 @@ define i64 @test_v2i64(<2 x i64> %a0) { ; SSE42-NEXT: movq %xmm2, %rax ; SSE42-NEXT: retq ; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor 
%xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: test_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index 4ad271dc23706..5676836f8f7ea 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -469,7 +469,8 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7] -; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index c350531e11b8a..1f35d333de68e 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -73,7 +73,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: var_rotate_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64] ; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 @@ -344,7 +344,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -355,7 +355,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 
; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -670,14 +670,23 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_rotate_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] -; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_rotate_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_rotate_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64] +; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512NOVLX-LABEL: splatvar_rotate_v2i64: ; AVX512NOVLX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index 4dc837be572c9..7cfdc5d6513ff 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -259,7 +259,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -269,7 +269,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 ; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -532,7 +532,7 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX2-LABEL: splatvar_rotate_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [64,64] ; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -1107,11 +1107,13 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; ; AVX512BW-LABEL: constant_rotate_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 @@ -1131,11 +1133,13 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { ; ; AVX512VBMI2-LABEL: constant_rotate_v32i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0] +; AVX512VBMI2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm2, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512VBMI2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0] +; AVX512VBMI2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpsrlw $8, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index 17f346138a92d..d3bb0e1deb9a1 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -38,7 +38,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512F-LABEL: var_rotate_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512F-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] @@ -67,7 +67,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512VL-LABEL: var_rotate_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] @@ -97,7 +97,7 @@ define <32 
x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -107,7 +107,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 @@ -147,7 +147,7 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8 ; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 @@ -190,7 +190,7 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm6 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index df7a66a309ed7..8c03a1a03b880 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -66,7 +66,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; AVX2-LABEL: var_shift_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -632,14 +632,23 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_shift_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor 
%xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_shift_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v2i64: ; XOPAVX1: # %bb.0: @@ -837,7 +846,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -933,15 +942,25 @@ define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwi ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_modulo_shift_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_modulo_shift_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64: ; XOPAVX1: # %bb.0: @@ -1130,7 +1149,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1708,14 +1727,23 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa 
{{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v16i8: ; XOP: # %bb.0: @@ -1726,18 +1754,26 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: splatconstant_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: splatconstant_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatconstant_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BWVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v16i8: ; X86-SSE: # %bb.0: @@ -1762,15 +1798,25 @@ define <2 x i64> @PR52719(<2 x i64> %a0, i32 %a1) { ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: PR52719: -; AVX: # %bb.0: -; AVX-NEXT: vmovd %edi, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: PR52719: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %edi, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR52719: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: PR52719: ; XOPAVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll index 99c735dec13c0..a7d4e88af0e6d 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -115,7 +115,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; X86-AVX2-LABEL: var_shift_v4i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -723,7 +723,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; ; X86-AVX2-LABEL: splatvar_shift_v4i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -888,7 +888,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -925,7 +925,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -943,7 +943,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -990,7 
+990,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1084,7 +1084,7 @@ define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwi ; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -1252,7 +1252,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1291,7 +1291,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1310,7 +1310,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -1358,7 +1358,7 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = 
[32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1959,7 +1959,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1977,7 +1977,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -1986,18 +1986,26 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: splatconstant_shift_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 -; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: splatconstant_shift_v32i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512DQVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatconstant_shift_v32i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BWVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512BWVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: retq ; ; X86-AVX1-LABEL: splatconstant_shift_v32i8: ; X86-AVX1: # %bb.0: @@ -2019,7 +2027,7 @@ define <32 x 
i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl @@ -2185,7 +2193,7 @@ define <4 x i64> @PR52719(<4 x i64> %a0, i32 %a1) { ; X86-AVX2-LABEL: PR52719: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll index b6ad5306f5d1e..7ea94678e0b8e 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -197,7 +197,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2 @@ -212,7 +212,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -286,7 +286,7 @@ define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwi ; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpsrlw 
%xmm1, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2 @@ -301,7 +301,7 @@ define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwi ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1 @@ -449,9 +449,9 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512DQ-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 @@ -464,7 +464,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512BW-LABEL: splatconstant_shift_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index d501512201cd1..1fbdc3b45cf35 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1356,7 +1356,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: 
vpsubb %xmm1, %xmm0, %xmm0 @@ -1494,7 +1494,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1632,7 +1632,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1818,7 +1818,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1896,7 +1896,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -2308,14 +2308,23 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v8i8: ; XOP: # %bb.0: @@ -2326,18 +2335,26 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = 
[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: splatconstant_shift_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: splatconstant_shift_v8i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatconstant_shift_v8i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BWVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v8i8: ; X86-SSE: # %bb.0: @@ -2361,14 +2378,23 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v4i8: ; XOP: # %bb.0: @@ -2379,18 +2405,26 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: splatconstant_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: 
splatconstant_shift_v4i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatconstant_shift_v4i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BWVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v4i8: ; X86-SSE: # %bb.0: @@ -2414,14 +2448,23 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v2i8: ; XOP: # %bb.0: @@ -2432,18 +2475,26 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq ; -; AVX512VL-LABEL: splatconstant_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX512DQVL-LABEL: splatconstant_shift_v2i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512DQVL-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512DQVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatconstant_shift_v2i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BWVL-NEXT: vpternlogd $108, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512BWVL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512BWVL-NEXT: retq ; ; X86-SSE-LABEL: splatconstant_shift_v2i8: ; X86-SSE: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index f02849d61454a..f647208a8000e 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -53,18 +53,18 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsrlw $4, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $2, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $1, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index 9cab44b069fd4..510ae15ba0960 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1511,7 +1511,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1581,7 +1581,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index 07902b4a86dee..dd63565a1dec7 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ 
b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -1261,7 +1261,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1291,7 +1291,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1310,7 +1310,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQVL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQVL-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1352,7 +1352,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; X86-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll index 92b60490f976d..8eca56d099feb 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -53,13 +53,13 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, 
%zmm0, %ymm2 ; AVX512DQ-NEXT: vpsllw $4, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $2, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 @@ -301,7 +301,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1] ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] @@ -323,7 +323,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index bd7b250b3d8c5..e7600d272c66f 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -1339,7 +1339,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1399,7 +1399,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 4e9d17801f5ce..95320ecc340cd 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -613,11 +613,17 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31( ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX512VL: # %bb.0: @@ -625,6 +631,18 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31( ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -653,11 +671,17 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31( ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; AVX2: # 
%bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; AVX512VL: # %bb.0: @@ -665,6 +689,18 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31( ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -707,11 +743,17 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31( ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; AVX512VL: # %bb.0: @@ -719,6 +761,18 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31( ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -910,7 +964,7 @@ define <16 x i8> @shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30( ; ; AVX2-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -966,7 +1020,7 @@ 
define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; AVX2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -974,7 +1028,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; AVX512VLBW-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1021,11 +1075,17 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: load_fold_pblendvb: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: load_fold_pblendvb: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_fold_pblendvb: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: load_fold_pblendvb: ; AVX512VL: # %bb.0: @@ -1033,6 +1093,18 @@ define <16 x i8> @load_fold_pblendvb(ptr %px, <16 x i8> %y) { ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ; AVX512VL-NEXT: retq +; +; XOPAVX1-LABEL: load_fold_pblendvb: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: load_fold_pblendvb: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq %x = load <16 x i8>, ptr %px, align 16 %select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> ret <16 x i8> %select @@ -1065,11 +1137,17 @@ define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: load_fold_pblendvb_commute: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] -; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: load_fold_pblendvb_commute: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_fold_pblendvb_commute: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: load_fold_pblendvb_commute: ; AVX512VL: # %bb.0: @@ 
-1079,6 +1157,18 @@ define <16 x i8> @load_fold_pblendvb_commute(ptr %px, <16 x i8> %y) { ; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VL-NEXT: retq +; +; XOPAVX1-LABEL: load_fold_pblendvb_commute: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: load_fold_pblendvb_commute: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq %x = load <16 x i8>, ptr %px, align 16 %select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> ret <16 x i8> %select @@ -2103,7 +2193,7 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) { ; ; AVX2-LABEL: PR12412: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index e858c7cdbfa29..8cfeb2adba5d3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -6844,7 +6844,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, ; ; AVX512VL-FAST-CROSSLANE-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: ; AVX512VL-FAST-CROSSLANE: # %bb.0: -; AVX512VL-FAST-CROSSLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-FAST-CROSSLANE-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-FAST-CROSSLANE-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-FAST-CROSSLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 7e7ba8b9ae65b..aebcb68f11c38 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2219,7 +2219,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_ ; ; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2237,7 +2237,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_ ; ; XOPAVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -2255,7 +2255,7 @@ define <32 x i8> 
@shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_ ; ; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2273,7 +2273,7 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_ ; ; XOPAVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -2293,7 +2293,7 @@ define <32 x i8> @load_fold_pblendvb(ptr %px, <32 x i8> %y) { ; ; AVX2-LABEL: load_fold_pblendvb: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] ; AVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2312,7 +2312,7 @@ define <32 x i8> @load_fold_pblendvb(ptr %px, <32 x i8> %y) { ; ; XOPAVX2-LABEL: load_fold_pblendvb: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] ; XOPAVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %x = load <32 x i8>, ptr %px, align 32 @@ -2331,7 +2331,7 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) { ; ; AVX2-LABEL: load_fold_pblendvb_commute: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] ; AVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2353,7 +2353,7 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) { ; ; XOPAVX2-LABEL: load_fold_pblendvb_commute: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] ; XOPAVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %x = load <32 x i8>, ptr %px, align 32 @@ -4517,7 +4517,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_ ; ; AVX512VLVBMI-FAST-ALL-LABEL: 
shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLVBMI-FAST-ALL-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-FAST-ALL-NEXT: retq ; @@ -4783,7 +4783,7 @@ define <32 x i8> @shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_ ; ; AVX512VL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index 6200187a14a03..4668d7b6870ef 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -29,7 +29,7 @@ define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0 ; ; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: ; SKX: ## %bb.0: -; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SKX-NEXT: vpbroadcastw {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index d44e584599246..843b285ae1c36 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -163,7 +163,8 @@ define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512F-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0 @@ -179,7 +180,8 @@ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_ ; ; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 @@ -463,7 +465,8 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_ ; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] +; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -480,7 +483,8 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_ ; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -505,7 +509,8 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ ; AVX512F-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] +; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -528,7 +533,8 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] +; AVX512DQ-NEXT: # ymm3 = 
mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll index 90b5e70a0a302..8cc20ec3c1a7e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -455,7 +455,7 @@ define <32 x i8> @test_mm256_mask_blend_epi8(<32 x i8> %A, <32 x i8> %W){ ; ; AVX512F-LABEL: test_mm256_mask_blend_epi8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: ret{{[l|q]}} entry: @@ -473,7 +473,7 @@ define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){ ; ; AVX512F-LABEL: test_mm_mask_blend_epi8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: ret{{[l|q]}} entry: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index b2d813dd440a6..a5ba81d516f72 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -56,7 +56,8 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; X86-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 ; X86-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} ; X86-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} @@ -66,7 +67,8 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; X64-LABEL: combine_pshufb_identity_mask: ; X64: # %bb.0: ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; X64-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-NEXT: kmovq %rdi, %k1 ; X64-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 ; X64-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} @@ -157,14 +159,16 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64 ; X86-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vmovdqa64 {{.*#+}} zmm2 = <7,0,u,u,5,0,u,u,u,u,12,0,u,u,14,0> +; 
X86-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,0,12,0,5,0,14,0,7,0,12,0,5,0,14,0] +; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; X86-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; X86-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63] ; X86-NEXT: retl ; ; X64-LABEL: combine_permi2q_pshufb_as_permi2d_mask: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = <7,u,5,u,u,12,u,14> +; X64-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [7,12,5,14,7,12,5,14] +; X64-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; X64-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 ; X64-NEXT: kmovq %rdi, %k1 ; X64-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm2[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,20,21,22,23,20,21,22,23,20,21,22,23,20,21,22,23,40,41,42,43,40,41,42,43,40,41,42,43,40,41,42,43,60,61,62,63,60,61,62,63,60,61,62,63,60,61,62,63] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 1b9648e77162e..c6006a9b7493a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1749,13 +1749,21 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test1c: -; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_test1c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test1c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -1835,13 +1843,21 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_test4c: -; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_test4c: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test4c: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 
= shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -3326,7 +3342,7 @@ define void @PR45604(ptr %dst, ptr %src) { ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u> ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll index f7132b1ea7d23..7159edc2bbdf4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -16,7 +16,8 @@ define <64 x i8> @f1(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 @@ -106,7 +107,8 @@ define <64 x i8> @f1(ptr %p0) { ; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] @@ -158,7 +160,8 @@ define <64 x i8> @f2(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15,1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 @@ -306,7 +309,8 @@ define <64 x i8> @f3(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] +; AVX2-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm0[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] @@ -340,7 +344,8 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} 
ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] +; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpternlogq $216, %ymm5, %ymm2, %ymm0 @@ -390,7 +395,8 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512BW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] @@ -441,7 +447,8 @@ define <64 x i8> @f4(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14,0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll index 5f4572b8c3d88..ed9f849d35d00 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll @@ -38,7 +38,8 @@ define <32 x i8> @foo(ptr %x0) { ; AVX2-NEXT: vmovdqu 16(%rdi), %xmm2 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 37b996bfe686a..6c57956b3e29c 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -43,7 +43,8 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -483,7 +484,8 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa 
{{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -821,7 +823,8 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -1231,7 +1234,8 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -2157,7 +2161,8 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; ; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-ALL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index 4680e86cf73ad..6ecc3980329a5 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -107,16 +107,27 @@ define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: 
trunc_packus_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -257,17 +268,29 @@ define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovq %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -514,7 +537,8 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX2-FAST-ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> +; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6] +; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-FAST-ALL-NEXT: vzeroupper @@ -1115,7 +1139,7 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1127,7 +1151,7 @@ define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1293,7 +1317,7 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1306,7 +1330,7 @@ define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-FAST-LABEL: 
trunc_packus_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -2801,16 +2825,27 @@ define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2955,17 +2990,29 @@ define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -3216,7 +3263,7 @@ define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) { ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; 
AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3475,7 +3522,7 @@ define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index f6e4377f64fa7..c378281ac009e 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -111,16 +111,27 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -255,17 +266,29 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovlpd %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm0, 
%xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -1126,10 +1149,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1138,10 +1161,10 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) { ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] @@ -1296,10 +1319,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1309,10 +1332,10 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = 
[18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2539,16 +2562,27 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2686,17 +2720,29 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) { ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -2948,7 +2994,7 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) { ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -3208,7 +3254,7 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index f687374baea4b..a830a96cdcb22 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -67,16 +67,27 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -165,17 +176,29 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] -; 
AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovlpd %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -748,7 +771,7 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) { ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] ; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -760,7 +783,7 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) { ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] ; AVX2-FAST-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] @@ -872,7 +895,7 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) { ; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] ; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -885,7 +908,7 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) { ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535] ; AVX2-FAST-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, 
%xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2005,16 +2028,27 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) { ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2106,17 +2140,29 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) { ; SSE41-NEXT: pextrw $0, %xmm2, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -2280,7 +2326,7 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) { ; 
AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -2452,7 +2498,7 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) { ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -4408,7 +4454,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; ; AVX2-LABEL: trunc_usat_v32i16_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vpminuw (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 @@ -4417,7 +4463,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; ; AVX512F-LABEL: trunc_usat_v32i16_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX512F-NEXT: vpminuw (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero @@ -4429,7 +4475,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(ptr %p0) { ; ; AVX512VL-LABEL: trunc_usat_v32i16_v32i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX512VL-NEXT: vpminuw (%rdi), %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index b5fa7312f7121..8c85c82b52ec6 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -582,7 +582,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) { ; AVX2-LABEL: trunc8i32_8i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: 
vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1815,17 +1815,25 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc2x8i16_16i8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc2x8i16_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc2x8i16_16i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc2x8i16_16i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1833,7 +1841,7 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { ; ; AVX512VL-LABEL: trunc2x8i16_16i8: ; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index caeb0015d4b52..0a19c166f936b 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -122,7 +122,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -150,7 +150,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -323,7 +323,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -351,7 +351,7 @@ define <2 x 
i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -548,7 +548,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -789,7 +789,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -992,23 +992,77 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw 
$8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv8i16: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv8i16: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i16: ; AVX512VPOPCNTDQ: # %bb.0: @@ -1168,23 +1222,77 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16u: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv8i16u: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16u: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv8i16u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv8i16u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i16u: ; AVX512VPOPCNTDQ: # %bb.0: @@ -1330,20 +1438,65 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, 
%xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv16i8: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv16i8: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv16i8: ; AVX512VPOPCNTDQ: # %bb.0: @@ -1485,20 +1638,65 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8u: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv16i8u: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8u: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv16i8u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv16i8u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv16i8u: ; AVX512VPOPCNTDQ: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll index f97223b79cb0c..f2c84713d47ef 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -45,9 +45,10 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -119,9 +120,10 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] 
+; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -168,9 +170,10 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -242,9 +245,10 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -299,9 +303,10 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -385,9 +390,10 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -446,9 +452,10 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -532,9 +539,10 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -588,9 +596,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -606,9 +615,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -624,9 +634,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -679,9 +690,10 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -732,9 +744,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -750,9 +763,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -768,9 +782,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -823,9 +838,10 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -870,9 +886,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -885,9 +902,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -900,9 +918,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -915,9 +934,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -930,9 +950,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb 
%ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -962,9 +983,10 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1006,9 +1028,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1021,9 +1044,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1036,9 +1060,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: 
vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1051,9 +1076,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1066,9 +1092,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQVL-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 @@ -1098,9 +1125,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: # ymm3 = mem[0,1,0,1] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll index cb64483731433..368fcd3e0e9a1 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll @@ -31,9 +31,10 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; 
AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -90,9 +91,10 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -149,9 +151,10 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -216,9 +219,10 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -263,9 +267,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -294,9 +299,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -312,9 +318,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -359,9 +366,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm2 @@ -390,9 +398,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -408,9 +417,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -456,9 +466,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 @@ -480,9 +491,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -495,9 +507,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -511,9 +524,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 @@ -548,9 +562,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm3, 
%ymm1, %ymm1 @@ -572,9 +587,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDBW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -587,9 +603,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 ; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 @@ -603,9 +620,10 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; 
AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll index 24f40b6fdf1be..650ee0e7e3f1d 100644 --- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -353,15 +353,25 @@ define <16 x i1> @ugt_v16i8(<16 x i8> %x, <16 x i8> %y) { ; SSE-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ugt_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ugt_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ugt_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %sh1 = lshr <16 x i8> %x, %sh2 = lshr <16 x i8> %y, %cmp = icmp ugt <16 x i8> %sh1, %sh2 @@ -380,15 +390,25 @@ define <16 x i1> @ult_v16i8(<16 x i8> %x, <16 x i8> %y) { ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ult_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ult_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ult_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %sh1 = lshr <16 x i8> %x, %sh2 = lshr <16 x i8> %y, %cmp = icmp ult <16 x i8> %sh1, %sh2 @@ -407,16 +427,27 @@ define <16 x i1> @uge_v16i8(<16 x i8> %x, <16 x i8> %y) { ; SSE-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: uge_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: uge_v16i8: +; AVX1: # %bb.0: 
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uge_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %sh1 = lshr <16 x i8> %x, %sh2 = lshr <16 x i8> %y, %cmp = icmp uge <16 x i8> %sh1, %sh2 @@ -435,16 +466,27 @@ define <16 x i1> @ule_v16i8(<16 x i8> %x, <16 x i8> %y) { ; SSE-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ule_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ule_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ule_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %sh1 = lshr <16 x i8> %x, %sh2 = lshr <16 x i8> %y, %cmp = icmp ule <16 x i8> %sh1, %sh2 diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll index 1781196fc6f64..d180bbe9d53e0 100644 --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -447,14 +447,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) { ; X86-SSE2-NEXT: psubb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; 
+; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_1: ; X64-SSE2: # %bb.0: @@ -465,14 +474,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_1(<16 x i8> %a0) { ; X64-SSE2-NEXT: psubb %xmm1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <16 x i8> %a0, %t1 = ashr <16 x i8> %t0, ret <16 x i8> %t1 @@ -487,14 +505,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) { ; X86-SSE2-NEXT: psubb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_4: ; X64-SSE2: # %bb.0: @@ -505,14 +532,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_4(<16 x i8> %a0) { ; X64-SSE2-NEXT: psubb %xmm1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = 
[8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <16 x i8> %a0, %t1 = ashr <16 x i8> %t0, ret <16 x i8> %t1 @@ -527,14 +563,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) { ; X86-SSE2-NEXT: psubb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_5: ; X64-SSE2: # %bb.0: @@ -545,14 +590,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_5(<16 x i8> %a0) { ; X64-SSE2-NEXT: psubb %xmm1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0 -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; 
X64-AVX2-NEXT: retq %t0 = and <16 x i8> %a0, %t1 = ashr <16 x i8> %t0, ret <16 x i8> %t1 @@ -567,14 +621,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) { ; X86-SSE2-NEXT: psubb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_6: ; X64-SSE2: # %bb.0: @@ -585,14 +648,23 @@ define <16 x i8> @test_128_i8_x_16_224_mask_ashr_6(<16 x i8> %a0) { ; X64-SSE2-NEXT: psubb %xmm1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0 -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <16 x i8> %a0, %t1 = ashr <16 x i8> %t0, ret <16 x i8> %t1 diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index 367e0993e76ba..6d42147bd7c45 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -182,7 +182,7 @@ define <32 x i8> @PR22706(<32 x i1> %x) { ; AVX512-LABEL: PR22706: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512-NEXT: vpblendvb %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512-NEXT: retq %tmp = select <32 x i1> %x, <32 x i8> , <32 x i8> diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll 
b/llvm/test/CodeGen/X86/vselect-minmax.ll index 7a3e9af985497..3a813ccd41fdc 100644 --- a/llvm/test/CodeGen/X86/vselect-minmax.ll +++ b/llvm/test/CodeGen/X86/vselect-minmax.ll @@ -9521,7 +9521,7 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test181: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -9589,7 +9589,7 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test182: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -9657,7 +9657,7 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test183: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -9725,7 +9725,7 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test184: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10027,7 +10027,7 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test189: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10095,7 +10095,7 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test190: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10163,7 +10163,7 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test191: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10231,7 +10231,7 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) { ; ; AVX2-LABEL: test192: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll 
b/llvm/test/CodeGen/X86/vselect-pcmp.ll index a0573a449646d..7a9b66d026a44 100644 --- a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -681,7 +681,7 @@ define <2 x i64> @blend_splatmax_mask_cond_v2i64(<2 x i64> %x, <2 x i64> %y, <2 ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; AVX512F-NEXT: vptestnmq %zmm3, %zmm0, %k1 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll index fdbc361e85d22..e91b8d029bcb4 100644 --- a/llvm/test/CodeGen/X86/vselect-post-combine.ll +++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll @@ -5,7 +5,7 @@ define ptr @test_mul(ptr %addr) { ; AVX2-LABEL: test_mul: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpblendvb %xmm0, (%rdi), %xmm1, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vmovdqu %ymm0, 0 diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index be720f59d978e..2fd2afdee8c11 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -483,39 +483,39 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = 
[2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -648,16 +648,16 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm9 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,4,0,4,0,4,0,4] ; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 @@ -665,16 +665,16 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm8 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 ; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] ; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 ; AVX2-NEXT: 
vpermd %ymm10, %ymm6, %ymm10 ; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 @@ -682,32 +682,32 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] ; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 ; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] ; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 ; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 ; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 ; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 @@ -830,7 +830,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){ ; AVX2OR512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 ; AVX2OR512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX2OR512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX2OR512-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2OR512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -838,7 +839,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){ ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26] ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26] -; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX2OR512-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 ; AVX2OR512-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] @@ -1025,7 +1027,8 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -1206,7 +1209,8 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 @@ -1399,7 +1403,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] +; AVX2-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm5 ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5 @@ -1408,7 +1413,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm3 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 ; AVX2-NEXT: vpblendvb 
%ymm8, %ymm2, %ymm0, %ymm9 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u,1,4,7,10,13,0,3,6,9,12,15,u,u,u,u,u> +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0] +; AVX2-NEXT: # ymm10 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm8 ; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm8 @@ -1425,7 +1431,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpaddb %ymm3, %ymm8, %ymm3 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5] ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] ; AVX2-NEXT: # ymm7 = mem[0,1,0,1] @@ -1454,7 +1460,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] +; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index 86737f28e28cc..8d6c1483d817f 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -3092,7 +3092,8 @@ define void @vec384_v48i8_to_v8i48_factor6(ptr %in.vec.base.ptr, ptr %in.vec.bia ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,1,u,u,2,u,u,3,u,u,4,u,u,5> +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,3,0,1,4,0,2,5,0,3,0,1,4,0,2,5] +; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 85bec77fe5eb2..c93d4a60898e5 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -1221,7 +1221,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} 
ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1346,7 +1346,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1466,7 +1466,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -2429,7 +2430,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2579,7 +2580,8 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = 
zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -2723,7 +2725,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2874,7 +2876,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -3018,7 +3021,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -3168,7 +3171,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -3310,7 +3314,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -3455,10 +3460,12 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastb %xmm1, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255> +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -5227,7 +5234,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7] +; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 @@ -5244,7 +5252,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 
(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u> +; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm1 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 @@ -5406,7 +5415,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,0,7] +; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,7,0,7] +; AVX512BW-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -5508,7 +5518,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u> +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -5525,7 +5536,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u> +; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0] +; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovaps 32(%rdx), %ymm1 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 2e00b5c9c91a5..0ea821b11bc96 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1012,7 +1012,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1110,7 +1110,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1202,7 +1202,8 @@ define void @vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2(ptr %i ; AVX2-LABEL: vec256_i8_widen_to_i128_factor16_broadcast_to_v2i128_factor2: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1909,7 +1910,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2039,7 +2040,8 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -2161,7 +2163,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16],zero,zero,zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = 
[0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2291,7 +2293,8 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -2413,7 +2416,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2543,7 +2546,8 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -2664,7 +2668,8 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2788,10 +2793,12 @@ define void @vec384_i8_widen_to_i192_factor24_broadcast_to_v2i192_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vpbroadcastb %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255> +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = mem[2,3],ymm0[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -4194,7 +4201,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512F-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7] +; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm0 @@ -4206,7 +4214,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512DQ-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
<16,29,30,31,4,5,16,7,u,u,u,u,u,u,u,u> +; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,16,29,30,31,4,5,16,7] +; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm0 @@ -4413,7 +4422,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u> +; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0] +; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpaddb (%rsi), %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rsi), %ymm1 @@ -4426,7 +4436,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,u,u,u,u> +; AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm2 = [0,7,10,0,0,7,10,0] +; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovaps 32(%rsi), %ymm1 From f81f32adc9a8e99a77dd4c5f5f83c5595b989a71 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 30 May 2023 12:36:54 +0100 Subject: [PATCH 073/704] [X86] lowerBuildVectorAsBroadcast - remove repeated hasAVX() check. NFC. We already early-out at the top of the function. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 68 ++++++++++++------------- 1 file changed, 32 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a89ab94c9e0d7..112687061c827 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9849,42 +9849,38 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); LLVMContext *Ctx = DAG.getContext(); MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); - if (Subtarget.hasAVX()) { - if (SplatBitSize == 32 || SplatBitSize == 64 || - (SplatBitSize < 32 && Subtarget.hasAVX2())) { - // Load the constant scalar/subvector and broadcast it. - MVT CVT = MVT::getIntegerVT(SplatBitSize); - Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); - SDValue CP = DAG.getConstantPool(C, PVT); - unsigned Repeat = VT.getSizeInBits() / SplatBitSize; - - Align Alignment = cast(CP)->getAlign(); - SDVTList Tys = - DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other); - SDValue Ops[] = {DAG.getEntryNode(), CP}; - MachinePointerInfo MPI = - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); - SDValue Brdcst = DAG.getMemIntrinsicNode( - X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment, - MachineMemOperand::MOLoad); - return DAG.getBitcast(VT, Brdcst); - } - if (SplatBitSize > 64) { - // Load the vector of constants and broadcast it. 
- Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, - *Ctx); - SDValue VCP = DAG.getConstantPool(VecC, PVT); - unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); - MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm); - Align Alignment = cast(VCP)->getAlign(); - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = {DAG.getEntryNode(), VCP}; - MachinePointerInfo MPI = - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); - return DAG.getMemIntrinsicNode( - X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment, - MachineMemOperand::MOLoad); - } + if (SplatBitSize == 32 || SplatBitSize == 64 || + (SplatBitSize < 32 && Subtarget.hasAVX2())) { + // Load the constant scalar/subvector and broadcast it. + MVT CVT = MVT::getIntegerVT(SplatBitSize); + Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); + SDValue CP = DAG.getConstantPool(C, PVT); + unsigned Repeat = VT.getSizeInBits() / SplatBitSize; + + Align Alignment = cast(CP)->getAlign(); + SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other); + SDValue Ops[] = {DAG.getEntryNode(), CP}; + MachinePointerInfo MPI = + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); + SDValue Brdcst = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, + MPI, Alignment, MachineMemOperand::MOLoad); + return DAG.getBitcast(VT, Brdcst); + } + if (SplatBitSize > 64) { + // Load the vector of constants and broadcast it. + Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); + SDValue VCP = DAG.getConstantPool(VecC, PVT); + unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); + MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm); + Align Alignment = cast(VCP)->getAlign(); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {DAG.getEntryNode(), VCP}; + MachinePointerInfo MPI = + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); + return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, + Ops, VVT, MPI, Alignment, + MachineMemOperand::MOLoad); } } From 96a14f388b1a3507e5ae97b0a21b7b785d99a52b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 30 May 2023 14:40:18 +0200 Subject: [PATCH 074/704] Revert "[FuncSpec] Replace LoopInfo with BlockFrequencyInfo" As reported on https://reviews.llvm.org/D150375#4367861 and following, this change causes PDT invalidation issues. Revert it and dependent commits. This reverts commit 0524534d5220da5ecb2cd424a46520184d2be366. This reverts commit ced90d1ff64a89a13479a37a3b17a411a3259f9f. This reverts commit 9f992cc9350a7f7072a6dbf018ea07142ea7a7ed. This reverts commit 1b1232047e83b69561fd64b9547cb0a0d374473a. 
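For orientation before the full diff: the cost model being restored below weighs each use of a would-be-constant argument by an assumed average loop trip count raised to the loop-nesting depth of the user, instead of by block frequency. A condensed, illustrative sketch only (the function name userBonusSketch is invented; AvgLoopIters, the loop-depth lookup, and the size-and-latency TTI query mirror the code re-added further down):

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/Instruction.h"
    #include <cmath>

    using namespace llvm;

    // Bonus for one user of a constant argument: its size-and-latency cost,
    // scaled up for every loop level it sits in (assumed ~10 iterations each,
    // matching the -funcspec-avg-loop-iters default restored by this revert).
    static InstructionCost userBonusSketch(Instruction *I,
                                           TargetTransformInfo &TTI,
                                           const LoopInfo &LI) {
      const unsigned AvgLoopIters = 10;
      InstructionCost Bonus =
          TTI.getInstructionCost(I, TargetTransformInfo::TCK_SizeAndLatency);
      Bonus *= std::pow((double)AvgLoopIters, LI.getLoopDepth(I->getParent()));
      return Bonus;
    }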
--- .../Transforms/IPO/FunctionSpecialization.h | 60 +--- .../llvm/Transforms/Utils/SCCPSolver.h | 3 + .../Transforms/IPO/FunctionSpecialization.cpp | 312 +++--------------- llvm/lib/Transforms/IPO/SCCP.cpp | 16 +- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 11 + llvm/test/Other/new-pm-defaults.ll | 45 +-- llvm/test/Other/new-pm-lto-defaults.ll | 15 +- .../Other/new-pm-thinlto-postlink-defaults.ll | 19 +- .../new-pm-thinlto-postlink-pgo-defaults.ll | 15 +- ...-pm-thinlto-postlink-samplepgo-defaults.ll | 15 +- .../compiler-crash-58759.ll | 2 +- .../function-specialization-always-inline.ll | 2 +- ...tion-specialization-constant-expression.ll | 50 ++- ...nction-specialization-constant-integers.ll | 2 +- .../function-specialization-loop.ll | 63 ++++ .../function-specialization-minsize3.ll | 2 +- .../function-specialization.ll | 4 +- .../function-specialization2.ll | 89 +++++ .../function-specialization3.ll | 4 +- .../get-possible-constants.ll | 2 +- .../FunctionSpecialization/global-rank.ll | 3 +- .../identical-specializations.ll | 12 +- .../FunctionSpecialization/literal-const.ll | 3 +- .../FunctionSpecialization/max-iters.ll | 110 ------ .../FunctionSpecialization/noinline.ll | 2 +- .../remove-dead-recursive-function.ll | 2 +- .../specialize-multiple-arguments.ll | 26 +- llvm/unittests/Transforms/IPO/CMakeLists.txt | 1 - .../IPO/FunctionSpecializationTest.cpp | 261 --------------- 29 files changed, 344 insertions(+), 807 deletions(-) create mode 100644 llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll create mode 100644 llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll delete mode 100644 llvm/test/Transforms/FunctionSpecialization/max-iters.ll delete mode 100644 llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h index e9ddff01f728c..e37386c85cfe6 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h @@ -48,11 +48,10 @@ #ifndef LLVM_TRANSFORMS_IPO_FUNCTIONSPECIALIZATION_H #define LLVM_TRANSFORMS_IPO_FUNCTIONSPECIALIZATION_H -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/IR/InstVisitor.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/SCCPSolver.h" @@ -70,9 +69,6 @@ using SpecMap = DenseMap>; // Just a shorter abbreviation to improve indentation. using Cost = InstructionCost; -// Map of known constants found during the specialization bonus estimation. -using ConstMap = DenseMap; - // Specialization signature, used to uniquely designate a specialization within // a function. 
struct SpecSig { @@ -119,39 +115,6 @@ struct Spec { : F(F), Sig(S), Score(Score) {} }; -class InstCostVisitor : public InstVisitor { - const DataLayout &DL; - BlockFrequencyInfo &BFI; - TargetTransformInfo &TTI; - SCCPSolver &Solver; - - ConstMap KnownConstants; - - ConstMap::iterator LastVisited; - -public: - InstCostVisitor(const DataLayout &DL, BlockFrequencyInfo &BFI, - TargetTransformInfo &TTI, SCCPSolver &Solver) - : DL(DL), BFI(BFI), TTI(TTI), Solver(Solver) {} - - Cost getUserBonus(Instruction *User, Value *Use, Constant *C); - -private: - friend class InstVisitor; - - Cost estimateSwitchInst(SwitchInst &I); - Cost estimateBranchInst(BranchInst &I); - - Constant *visitInstruction(Instruction &I) { return nullptr; } - Constant *visitLoadInst(LoadInst &I); - Constant *visitGetElementPtrInst(GetElementPtrInst &I); - Constant *visitSelectInst(SelectInst &I); - Constant *visitCastInst(CastInst &I); - Constant *visitCmpInst(CmpInst &I); - Constant *visitUnaryOperator(UnaryOperator &I); - Constant *visitBinaryOperator(BinaryOperator &I); -}; - class FunctionSpecializer { /// The IPSCCP Solver. @@ -163,7 +126,6 @@ class FunctionSpecializer { FunctionAnalysisManager *FAM; /// Analyses used to help determine if a function should be specialized. - std::function GetBFI; std::function GetTLI; std::function GetTTI; std::function GetAC; @@ -175,12 +137,11 @@ class FunctionSpecializer { public: FunctionSpecializer( SCCPSolver &Solver, Module &M, FunctionAnalysisManager *FAM, - std::function GetBFI, std::function GetTLI, std::function GetTTI, std::function GetAC) - : Solver(Solver), M(M), FAM(FAM), GetBFI(GetBFI), GetTLI(GetTLI), - GetTTI(GetTTI), GetAC(GetAC) {} + : Solver(Solver), M(M), FAM(FAM), GetTLI(GetTLI), GetTTI(GetTTI), + GetAC(GetAC) {} ~FunctionSpecializer(); @@ -188,18 +149,6 @@ class FunctionSpecializer { bool run(); - static unsigned getBlockFreqMultiplier(); - - InstCostVisitor getInstCostVisitorFor(Function *F) { - auto &BFI = (GetBFI)(*F); - auto &TTI = (GetTTI)(*F); - return InstCostVisitor(M.getDataLayout(), BFI, TTI, Solver); - } - - /// Compute a bonus for replacing argument \p A with constant \p C. - Cost getSpecializationBonus(Argument *A, Constant *C, - InstCostVisitor &Visitor); - private: Constant *getPromotableAlloca(AllocaInst *Alloca, CallInst *Call); @@ -243,6 +192,9 @@ class FunctionSpecializer { /// Compute and return the cost of specializing function \p F. Cost getSpecializationCost(Function *F); + /// Compute a bonus for replacing argument \p A with constant \p C. + Cost getSpecializationBonus(Argument *A, Constant *C, const LoopInfo &LI); + /// Determine if it is possible to specialise the function for constant values /// of the formal parameter \p A. bool isArgumentInteresting(Argument *A); diff --git a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h index 3a63290b3f4fd..cf3c3b7eee49f 100644 --- a/llvm/include/llvm/Transforms/Utils/SCCPSolver.h +++ b/llvm/include/llvm/Transforms/Utils/SCCPSolver.h @@ -44,6 +44,7 @@ struct AnalysisResultsForFn { std::unique_ptr PredInfo; DominatorTree *DT; PostDominatorTree *PDT; + LoopInfo *LI; }; /// Helper struct shared between Function Specialization and SCCP Solver. 
@@ -90,6 +91,8 @@ class SCCPSolver { const PredicateBase *getPredicateInfoFor(Instruction *I); + const LoopInfo &getLoopInfo(Function &F); + DomTreeUpdater getDTU(Function &F); /// trackValueOfGlobalVariable - Clients can use this method to diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index a635d7b4d40aa..51f1319a68122 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -48,14 +48,12 @@ #include "llvm/Transforms/IPO/FunctionSpecialization.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InlineCost.h" -#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/ConstantFold.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -74,22 +72,6 @@ static cl::opt ForceSpecialization( "Force function specialization for every call site with a constant " "argument")); -// Set to 2^3 to model three levels of if-else nest. -static cl::opt BlockFreqMultiplier( - "funcspec-block-freq-multiplier", cl::init(8), cl::Hidden, cl::desc( - "Multiplier to scale block frequency of user instructions during " - "specialization bonus estimation")); - -static cl::opt MinEntryFreq( - "funcspec-min-entry-freq", cl::init(450), cl::Hidden, cl::desc( - "Do not specialize functions with entry block frequency lower than " - "this value")); - -static cl::opt MinScore( - "funcspec-min-score", cl::init(2), cl::Hidden, cl::desc( - "Do not specialize functions with score lower than this value " - "(the ratio of specialization bonus over specialization cost)")); - static cl::opt MaxClones( "funcspec-max-clones", cl::init(3), cl::Hidden, cl::desc( "The maximum number of clones allowed for a single function " @@ -100,225 +82,23 @@ static cl::opt MinFunctionSize( "Don't specialize functions that have less than this number of " "instructions")); +static cl::opt AvgLoopIters( + "funcspec-avg-loop-iters", cl::init(10), cl::Hidden, cl::desc( + "Average loop iteration count")); + static cl::opt SpecializeOnAddress( "funcspec-on-address", cl::init(false), cl::Hidden, cl::desc( "Enable function specialization on the address of global values")); +// Disabled by default as it can significantly increase compilation times. +// +// https://llvm-compile-time-tracker.com +// https://github.com/nikic/llvm-compile-time-tracker static cl::opt SpecializeLiteralConstant( - "funcspec-for-literal-constant", cl::init(true), cl::Hidden, cl::desc( + "funcspec-for-literal-constant", cl::init(false), cl::Hidden, cl::desc( "Enable specialization of functions that take a literal constant as an " "argument")); -unsigned FunctionSpecializer::getBlockFreqMultiplier() { - return BlockFreqMultiplier; -} - -// Estimates the instruction cost of all the basic blocks in \p WorkList. -// The successors of such blocks are added to the list as long as they are -// executable and they have a unique predecessor. \p WorkList represents -// the basic blocks of a specialization which become dead once we replace -// instructions that are known to be constants. 
The aim here is to estimate -// the combination of size and latency savings in comparison to the non -// specialized version of the function. -static Cost estimateBasicBlocks(SmallVectorImpl &WorkList, - ConstMap &KnownConstants, SCCPSolver &Solver, - BlockFrequencyInfo &BFI, - TargetTransformInfo &TTI) { - Cost Bonus = 0; - - // Accumulate the instruction cost of each basic block weighted by frequency. - while (!WorkList.empty()) { - BasicBlock *BB = WorkList.pop_back_val(); - - uint64_t Weight = BlockFreqMultiplier * - BFI.getBlockFreq(BB).getFrequency() / - BFI.getEntryFreq(); - if (!Weight) - continue; - - for (Instruction &I : *BB) { - // Disregard SSA copies. - if (auto *II = dyn_cast(&I)) - if (II->getIntrinsicID() == Intrinsic::ssa_copy) - continue; - // If it's a known constant we have already accounted for it. - if (KnownConstants.contains(&I)) - continue; - - Bonus += Weight * - TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency); - - LLVM_DEBUG(dbgs() << "FnSpecialization: Bonus " << Bonus - << " after user " << I << "\n"); - } - - // Keep adding dead successors to the list as long as they are - // executable and they have a unique predecessor. - for (BasicBlock *SuccBB : successors(BB)) - if (Solver.isBlockExecutable(SuccBB) && - SuccBB->getUniquePredecessor() == BB) - WorkList.push_back(SuccBB); - } - return Bonus; -} - -static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) { - if (auto It = KnownConstants.find(V); It != KnownConstants.end()) - return It->second; - return nullptr; -} - -Cost InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) { - // Cache the iterator before visiting. - LastVisited = KnownConstants.insert({Use, C}).first; - - if (auto *I = dyn_cast(User)) - return estimateSwitchInst(*I); - - if (auto *I = dyn_cast(User)) - return estimateBranchInst(*I); - - C = visit(*User); - if (!C) - return 0; - - KnownConstants.insert({User, C}); - - uint64_t Weight = BlockFreqMultiplier * - BFI.getBlockFreq(User->getParent()).getFrequency() / - BFI.getEntryFreq(); - if (!Weight) - return 0; - - Cost Bonus = Weight * - TTI.getInstructionCost(User, TargetTransformInfo::TCK_SizeAndLatency); - - LLVM_DEBUG(dbgs() << "FnSpecialization: Bonus " << Bonus - << " for user " << *User << "\n"); - - for (auto *U : User->users()) - if (auto *UI = dyn_cast(U)) - if (Solver.isBlockExecutable(UI->getParent())) - Bonus += getUserBonus(UI, User, C); - - return Bonus; -} - -Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) { - if (I.getCondition() != LastVisited->first) - return 0; - - auto *C = cast(LastVisited->second); - BasicBlock *Succ = I.findCaseValue(C)->getCaseSuccessor(); - // Initialize the worklist with the dead basic blocks. These are the - // destination labels which are different from the one corresponding - // to \p C. They should be executable and have a unique predecessor. - SmallVector WorkList; - for (const auto &Case : I.cases()) { - BasicBlock *BB = Case.getCaseSuccessor(); - if (BB == Succ || !Solver.isBlockExecutable(BB) || - BB->getUniquePredecessor() != I.getParent()) - continue; - WorkList.push_back(BB); - } - - return estimateBasicBlocks(WorkList, KnownConstants, Solver, BFI, TTI); -} - -Cost InstCostVisitor::estimateBranchInst(BranchInst &I) { - if (I.getCondition() != LastVisited->first) - return 0; - - BasicBlock *Succ = I.getSuccessor(LastVisited->second->isOneValue()); - // Initialize the worklist with the dead successor as long as - // it is executable and has a unique predecessor. 
- SmallVector WorkList; - if (Solver.isBlockExecutable(Succ) && - Succ->getUniquePredecessor() == I.getParent()) - WorkList.push_back(Succ); - - return estimateBasicBlocks(WorkList, KnownConstants, Solver, BFI, TTI); -} - -Constant *InstCostVisitor::visitLoadInst(LoadInst &I) { - if (isa(LastVisited->second)) - return nullptr; - return ConstantFoldLoadFromConstPtr(LastVisited->second, I.getType(), DL); -} - -Constant *InstCostVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { - SmallVector Operands; - Operands.reserve(I.getNumOperands()); - - for (unsigned Idx = 0, E = I.getNumOperands(); Idx != E; ++Idx) { - Value *V = I.getOperand(Idx); - auto *C = dyn_cast(V); - if (!C) - C = findConstantFor(V, KnownConstants); - if (!C) - return nullptr; - Operands.push_back(C); - } - - auto *Ptr = cast(Operands[0]); - auto Ops = ArrayRef(Operands.begin() + 1, Operands.end()); - return ConstantFoldGetElementPtr(I.getSourceElementType(), Ptr, - I.isInBounds(), std::nullopt, Ops); -} - -Constant *InstCostVisitor::visitSelectInst(SelectInst &I) { - if (I.getCondition() != LastVisited->first) - return nullptr; - - Value *V = LastVisited->second->isZeroValue() ? I.getFalseValue() - : I.getTrueValue(); - auto *C = dyn_cast(V); - if (!C) - C = findConstantFor(V, KnownConstants); - return C; -} - -Constant *InstCostVisitor::visitCastInst(CastInst &I) { - return ConstantFoldCastOperand(I.getOpcode(), LastVisited->second, - I.getType(), DL); -} - -Constant *InstCostVisitor::visitCmpInst(CmpInst &I) { - bool Swap = I.getOperand(1) == LastVisited->first; - Value *V = Swap ? I.getOperand(0) : I.getOperand(1); - auto *Other = dyn_cast(V); - if (!Other) - Other = findConstantFor(V, KnownConstants); - - if (!Other) - return nullptr; - - Constant *Const = LastVisited->second; - return Swap ? - ConstantFoldCompareInstOperands(I.getPredicate(), Other, Const, DL) - : ConstantFoldCompareInstOperands(I.getPredicate(), Const, Other, DL); -} - -Constant *InstCostVisitor::visitUnaryOperator(UnaryOperator &I) { - return ConstantFoldUnaryOpOperand(I.getOpcode(), LastVisited->second, DL); -} - -Constant *InstCostVisitor::visitBinaryOperator(BinaryOperator &I) { - bool Swap = I.getOperand(1) == LastVisited->first; - Value *V = Swap ? I.getOperand(0) : I.getOperand(1); - auto *Other = dyn_cast(V); - if (!Other) - Other = findConstantFor(V, KnownConstants); - - if (!Other) - return nullptr; - - Constant *Const = LastVisited->second; - return dyn_cast_or_null(Swap ? - simplifyBinOp(I.getOpcode(), Other, Const, SimplifyQuery(DL)) - : simplifyBinOp(I.getOpcode(), Const, Other, SimplifyQuery(DL))); -} - Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca, CallInst *Call) { Value *StoreValue = nullptr; @@ -637,6 +417,10 @@ CodeMetrics &FunctionSpecializer::analyzeFunction(Function *F) { CodeMetrics::collectEphemeralValues(F, &(GetAC)(*F), EphValues); for (BasicBlock &BB : *F) Metrics.analyzeBasicBlock(&BB, (GetTTI)(*F), EphValues); + + LLVM_DEBUG(dbgs() << "FnSpecialization: Code size of function " + << F->getName() << " is " << Metrics.NumInsts + << " instructions\n"); } return Metrics; } @@ -667,7 +451,6 @@ bool FunctionSpecializer::findSpecializations(Function *F, Cost SpecCost, if (Args.empty()) return false; - bool HasCheckedEntryFreq = false; for (User *U : F->users()) { if (!isa(U) && !isa(U)) continue; @@ -703,21 +486,6 @@ bool FunctionSpecializer::findSpecializations(Function *F, Cost SpecCost, if (S.Args.empty()) continue; - // Check the function entry frequency only once. 
We sink this code here to - // postpone running the Block Frequency Analysis until we know for sure - // there are Specialization candidates, otherwise we are adding unnecessary - // overhead. - if (!HasCheckedEntryFreq) { - // Reject cold functions (for some definition of 'cold'). - uint64_t EntryFreq = (GetBFI)(*F).getEntryFreq(); - if (!ForceSpecialization && EntryFreq < MinEntryFreq) - return false; - - HasCheckedEntryFreq = true; - LLVM_DEBUG(dbgs() << "FnSpecialization: Entry block frequency for " - << F->getName() << " = " << EntryFreq << "\n"); - } - // Check if we have encountered the same specialisation already. if (auto It = UniqueSpecs.find(S); It != UniqueSpecs.end()) { // Existing specialisation. Add the call to the list to rewrite, unless @@ -732,14 +500,13 @@ bool FunctionSpecializer::findSpecializations(Function *F, Cost SpecCost, AllSpecs[Index].CallSites.push_back(&CS); } else { // Calculate the specialisation gain. - Cost Score = 0; - InstCostVisitor Visitor = getInstCostVisitorFor(F); + Cost Score = 0 - SpecCost; for (ArgInfo &A : S.Args) - Score += getSpecializationBonus(A.Formal, A.Actual, Visitor); - Score /= SpecCost; + Score += + getSpecializationBonus(A.Formal, A.Actual, Solver.getLoopInfo(*F)); // Discard unprofitable specialisations. - if (!ForceSpecialization && Score < MinScore) + if (!ForceSpecialization && Score <= 0) continue; // Create a new specialisation entry. @@ -823,23 +590,48 @@ Cost FunctionSpecializer::getSpecializationCost(Function *F) { // Otherwise, set the specialization cost to be the cost of all the // instructions in the function. - return Metrics.NumInsts; + return Metrics.NumInsts * InlineConstants::getInstrCost(); +} + +static Cost getUserBonus(User *U, TargetTransformInfo &TTI, + const LoopInfo &LI) { + auto *I = dyn_cast_or_null(U); + // If not an instruction we do not know how to evaluate. + // Keep minimum possible cost for now so that it doesnt affect + // specialization. + if (!I) + return std::numeric_limits::min(); + + Cost Bonus = + TTI.getInstructionCost(U, TargetTransformInfo::TCK_SizeAndLatency); + + // Increase the cost if it is inside the loop. + unsigned LoopDepth = LI.getLoopDepth(I->getParent()); + Bonus *= std::pow((double)AvgLoopIters, LoopDepth); + + // Traverse recursively if there are more uses. + // TODO: Any other instructions to be added here? + if (I->mayReadFromMemory() || I->isCast()) + for (auto *User : I->users()) + Bonus += getUserBonus(User, TTI, LI); + + return Bonus; } /// Compute a bonus for replacing argument \p A with constant \p C. Cost FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C, - InstCostVisitor &Visitor) { + const LoopInfo &LI) { + Function *F = A->getParent(); + auto &TTI = (GetTTI)(*F); LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: " << C->getNameOrAsOperand() << "\n"); Cost TotalCost = 0; - for (auto *U : A->users()) - if (auto *UI = dyn_cast(U)) - if (Solver.isBlockExecutable(UI->getParent())) - TotalCost += Visitor.getUserBonus(UI, A, C); - - LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated user bonus " - << TotalCost << " for argument " << *A << "\n"); + for (auto *U : A->users()) { + TotalCost += getUserBonus(U, TTI, LI); + LLVM_DEBUG(dbgs() << "FnSpecialization: User cost "; + TotalCost.print(dbgs()); dbgs() << " for: " << *U << "\n"); + } // The below heuristic is only concerned with exposing inlining // opportunities via indirect call promotion. 
If the argument is not a diff --git a/llvm/lib/Transforms/IPO/SCCP.cpp b/llvm/lib/Transforms/IPO/SCCP.cpp index 21b89ce4de403..5e2a23b9e62df 100644 --- a/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/llvm/lib/Transforms/IPO/SCCP.cpp @@ -13,7 +13,7 @@ #include "llvm/Transforms/IPO/SCCP.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -107,15 +107,13 @@ static void findReturnsToZap(Function &F, static bool runIPSCCP( Module &M, const DataLayout &DL, FunctionAnalysisManager *FAM, - std::function GetBFI, std::function GetTLI, std::function GetTTI, std::function GetAC, function_ref getAnalysis, bool IsFuncSpecEnabled) { SCCPSolver Solver(DL, GetTLI, M.getContext()); - FunctionSpecializer Specializer(Solver, M, FAM, GetBFI, GetTLI, GetTTI, - GetAC); + FunctionSpecializer Specializer(Solver, M, FAM, GetTLI, GetTTI, GetAC); // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. @@ -383,23 +381,21 @@ PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) { auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & { return FAM.getResult(F); }; - auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & { - return FAM.getResult(F); - }; auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { return FAM.getResult(F); }; auto GetAC = [&FAM](Function &F) -> AssumptionCache & { return FAM.getResult(F); }; - auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn { + auto getAnalysis = [&FAM, this](Function &F) -> AnalysisResultsForFn { DominatorTree &DT = FAM.getResult(F); return { std::make_unique(F, DT, FAM.getResult(F)), - &DT, FAM.getCachedResult(F) }; + &DT, FAM.getCachedResult(F), + isFuncSpecEnabled() ? 
&FAM.getResult(F) : nullptr }; }; - if (!runIPSCCP(M, DL, &FAM, GetBFI, GetTLI, GetTTI, GetAC, getAnalysis, + if (!runIPSCCP(M, DL, &FAM, GetTLI, GetTTI, GetAC, getAnalysis, isFuncSpecEnabled())) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 7d045221bde36..881c3cc7b56f6 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -664,6 +664,13 @@ class SCCPInstVisitor : public InstVisitor { return A->second.PredInfo->getPredicateInfoFor(I); } + const LoopInfo &getLoopInfo(Function &F) { + auto A = AnalysisResults.find(&F); + assert(A != AnalysisResults.end() && A->second.LI && + "Need LoopInfo analysis results for function."); + return *A->second.LI; + } + DomTreeUpdater getDTU(Function &F) { auto A = AnalysisResults.find(&F); assert(A != AnalysisResults.end() && "Need analysis results for function."); @@ -1955,6 +1962,10 @@ const PredicateBase *SCCPSolver::getPredicateInfoFor(Instruction *I) { return Visitor->getPredicateInfoFor(I); } +const LoopInfo &SCCPSolver::getLoopInfo(Function &F) { + return Visitor->getLoopInfo(F); +} + DomTreeUpdater SCCPSolver::getDTU(Function &F) { return Visitor->getDTU(F); } void SCCPSolver::trackValueOfGlobalVariable(GlobalVariable *GV) { diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 5cb9a7f331a68..59770fa6b1776 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -9,83 +9,83 @@ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O1,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O1,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-Os,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-NO-FUNC-SPEC,CHECK-Os,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-Oz,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-NO-FUNC-SPEC,CHECK-Oz,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto-pre-link' -S %s 2>&1 \ -; RUN: | FileCheck %s 
--check-prefixes=CHECK-O,CHECK-LTO,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-LTO,CHECK-NO-FUNC-SPEC,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-peephole='no-op-function' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-PEEPHOLE,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-PEEPHOLE,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-late-loop-optimizations='no-op-loop' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-LOOP-LATE,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-LOOP-LATE,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-loop-optimizer-end='no-op-loop' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-LOOP-END,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-LOOP-END,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-scalar-optimizer-late='no-op-function' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-SCALAR-LATE,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-SCALAR-LATE,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-cgscc-optimizer-late='no-op-cgscc' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-CGSCC-LATE,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-CGSCC-LATE,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-vectorizer-start='no-op-function' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-VECTORIZER-START,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-VECTORIZER-START,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-pipeline-start='no-op-module' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-pipeline-early-simplification='no-op-module' \ ; RUN: 
-passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-EARLY-SIMPLIFICATION,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-EARLY-SIMPLIFICATION,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-pipeline-start='no-op-module' \ ; RUN: -passes='lto-pre-link' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-LTO,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-LTO,CHECK-NO-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-optimizer-early='no-op-module' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-EARLY,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-EARLY,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes-ep-optimizer-last='no-op-module' \ ; RUN: -passes='default' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -enable-matrix -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -enable-merge-functions -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MERGE-FUNCS +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MERGE-FUNCS ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -ir-outliner -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-IR-OUTLINER +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-IR-OUTLINER ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -hot-cold-split -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-HOT-COLD-SPLIT +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-HOT-COLD-SPLIT ; Suppress FileCheck --allow-unused-prefixes=false diagnostics. 
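
The CHECK-FUNC-SPEC / CHECK-NO-FUNC-SPEC split in these RUN lines reflects that IPSCCP now requests LoopAnalysis up front whenever function specialization is enabled, so the first "Running analysis: LoopAnalysis" line moves from LoopSimplifyPass up to IPSCCPPass in those configurations. The angle-bracket template arguments in the SCCP.cpp hunk earlier were lost in the quoted text; a best-effort reading of that getAnalysis lambda is sketched below, so treat the exact analysis types as reconstructed rather than quoted verbatim.

// Best-effort reconstruction of the getAnalysis lambda added to
// IPSCCPPass::run in the SCCP.cpp hunk above; the <...> template arguments
// were stripped from the quoted diff and are filled in as an educated
// reading, not a verbatim quote.
auto getAnalysis = [&FAM, this](Function &F) -> AnalysisResultsForFn {
  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  return {std::make_unique<PredicateInfo>(F, DT,
                                          FAM.getResult<AssumptionAnalysis>(F)),
          &DT, FAM.getCachedResult<PostDominatorTreeAnalysis>(F),
          // LoopInfo is only computed when function specialization will
          // actually consume it; this is what the CHECK-FUNC-SPEC vs
          // CHECK-NO-FUNC-SPEC prefixes assert on in the pipeline tests.
          isFuncSpecEnabled() ? &FAM.getResult<LoopAnalysis>(F) : nullptr};
};

Keeping the request behind isFuncSpecEnabled() leaves the -Os/-Oz and LTO pre-link pipelines unchanged, which is why those RUN lines switch to the CHECK-NO-FUNC-SPEC prefix instead.
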
; CHECK-Oz: {{^}} @@ -109,6 +109,7 @@ ; CHECK-O-NEXT: Running pass: OpenMPOptPass ; CHECK-EP-PIPELINE-EARLY-SIMPLIFICATION-NEXT: Running pass: NoOpModulePass ; CHECK-O-NEXT: Running pass: IPSCCPPass +; CHECK-FUNC-SPEC-NEXT: Running analysis: LoopAnalysis ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running pass: PromotePass @@ -163,7 +164,7 @@ ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: ReassociatePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass -; CHECK-O-NEXT: Running analysis: LoopAnalysis +; CHECK-NO-FUNC-SPEC-NEXT: Running analysis: LoopAnalysis ; CHECK-O-NEXT: Running pass: LCSSAPass ; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll index 3c0f2bbdc06bc..7f0b335b867d0 100644 --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -9,23 +9,23 @@ ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-EP ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s -passes-ep-full-link-time-optimization-early=no-op-module \ ; RUN: -passes-ep-full-link-time-optimization-last=no-op-module 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-EP +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23,CHECK-O23SZ,CHECK-EP ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OS,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OS,CHECK-OSZ,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-O23SZ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='lto' -S %s -passes-ep-peephole='no-op-function' 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-EP-Peephole +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23,CHECK-O23SZ,CHECK-EP-Peephole ; CHECK-EP: Running pass: NoOpModulePass ; CHECK-O: Running pass: CrossDSOCFIPass @@ -43,6 +43,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis ; CHECK-O23SZ-NEXT: Running pass: IPSCCPPass ; CHECK-O23SZ-NEXT: Running analysis: AssumptionAnalysis on foo +; CHECK-O23-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running analysis: 
InnerAnalysisManagerProxy<{{.*}}SCC ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis @@ -93,7 +94,7 @@ ; CHECK-O23SZ-NEXT: Invalidating analysis: AAManager on foo ; CHECK-O23SZ-NEXT: Running pass: OpenMPOptCGSCCPass on (foo) ; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass on foo -; CHECK-O23SZ-NEXT: Running analysis: LoopAnalysis on foo +; CHECK-OSZ-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: LCSSAPass on foo ; CHECK-O23SZ-NEXT: Running analysis: MemorySSAAnalysis on foo ; CHECK-O23SZ-NEXT: Running analysis: AAManager on foo diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index aa3b8e85749d9..dc97fd516311f 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -10,28 +10,28 @@ ; Postlink pipelines: ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-POSTLINK-O,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O1,CHECK-POSTLINK-O,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2 +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2 ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-pipeline-start='no-op-module' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3 +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3 ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-optimizer-early='no-op-module' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3,CHECK-POST-EP-OPT-EARLY +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3,CHECK-POST-EP-OPT-EARLY ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-optimizer-last='no-op-module' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3,CHECK-POST-EP-OPT-LAST +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O3,CHECK-POST-EP-OPT-LAST ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-Os +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-NO-FUNC-SPEC,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-Os ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 
-debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-NO-FUNC-SPEC,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -debug-info-for-profiling \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2 +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-FUNC-SPEC,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,%llvmcheckext,CHECK-POSTLINK-O2 ; Suppress FileCheck --allow-unused-prefixes=false diagnostics. ; CHECK-NOEXT: {{^}} @@ -49,6 +49,7 @@ ; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-O-NEXT: Running analysis: AssumptionAnalysis ; CHECK-O-NEXT: Running analysis: TargetIRAnalysis +; CHECK-FUNC-SPEC-NEXT: Running analysis: LoopAnalysis ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis @@ -100,7 +101,7 @@ ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: ReassociatePass ; CHECK-O-NEXT: Running pass: LoopSimplifyPass -; CHECK-O-NEXT: Running analysis: LoopAnalysis +; CHECK-NO-FUNC-SPEC-NEXT: Running analysis: LoopAnalysis ; CHECK-O-NEXT: Running pass: LCSSAPass ; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index bfa3ed6e4b757..e08019fa38d35 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -3,22 +3,22 @@ ; Postlink pipelines: ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O1,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-pipeline-start='no-op-module' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O3,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-Os,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-O23SZ,%llvmcheckext 
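
The pipeline differences checked here all feed the profitability model restored in FunctionSpecialization.cpp earlier in this patch: each user of a constant argument contributes its TTI size-and-latency cost scaled by AvgLoopIters raised to the user's loop depth, and a candidate is kept only if the summed bonus exceeds Metrics.NumInsts * InlineConstants::getInstrCost(). A minimal standalone sketch of that arithmetic follows; the struct name, per-instruction cost, function size, and loop depths are all made up for illustration and are not LLVM API.

// Minimal standalone sketch of the restored scoring; all numbers are
// hypothetical stand-ins, not values taken from LLVM.
#include <cmath>
#include <cstdio>
#include <vector>

struct ArgUse {
  unsigned InstrCost; // stand-in for TTI.getInstructionCost(U, TCK_SizeAndLatency)
  unsigned LoopDepth; // stand-in for LI.getLoopDepth(I->getParent())
};

int main() {
  const unsigned AvgLoopIters = 10; // -funcspec-avg-loop-iters default
  const unsigned InstrCost = 4;     // stand-in for InlineConstants::getInstrCost()
  const unsigned NumInsts = 20;     // stand-in for CodeMetrics::NumInsts

  // Hypothetical users of one constant argument: one outside any loop,
  // one inside a doubly nested loop.
  std::vector<ArgUse> Users = {{4, 0}, {4, 2}};

  // Score starts at 0 - SpecCost, then each user adds a loop-scaled bonus.
  long long Score = -static_cast<long long>(NumInsts * InstrCost);
  for (const ArgUse &U : Users)
    Score += static_cast<long long>(
        U.InstrCost * std::pow(static_cast<double>(AvgLoopIters), U.LoopDepth));

  // 4 + 400 - 80 = 324 > 0, so this candidate would be kept.
  std::printf("score = %lld -> %s\n", Score,
              Score > 0 ? "specialize" : "discard");
  return 0;
}

The new function-specialization-loop.ll test added below relies on exactly this weighting: the add inside the doubly nested loop earns a large enough bonus at -funcspec-avg-loop-iters=5 to justify creating both clones of @foo.
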
; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -debug-info-for-profiling \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; Suppress FileCheck --allow-unused-prefixes=false diagnostics. ; CHECK-NOEXT: {{^}} @@ -34,6 +34,7 @@ ; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-O-NEXT: Running analysis: AssumptionAnalysis ; CHECK-O-NEXT: Running analysis: TargetIRAnalysis +; CHECK-O123-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis @@ -47,7 +48,7 @@ ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo -; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo +; CHECK-OSZ-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 004ec790e9847..66ee95f398a8d 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -3,27 +3,27 @@ ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O1,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -passes-ep-pipeline-start='no-op-module' \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O3,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-Os,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: 
-pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-OSZ,CHECK-O23SZ,%llvmcheckext ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager -debug-info-for-profiling \ ; RUN: -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-thinlto-samplepgo-defaults.prof' \ ; RUN: -passes='thinlto' -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,%llvmcheckext +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-O123,CHECK-O2,CHECK-O23SZ,%llvmcheckext ; Suppress FileCheck --allow-unused-prefixes=false diagnostics. ; CHECK-NOEXT: {{^}} @@ -43,6 +43,7 @@ ; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis ; CHECK-O-NEXT: Running analysis: AssumptionAnalysis ; CHECK-O-NEXT: Running analysis: TargetIRAnalysis +; CHECK-O123-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis @@ -55,7 +56,7 @@ ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo -; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo +; CHECK-OSZ-NEXT: Running analysis: LoopAnalysis on foo ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo ; CHECK-O-NEXT: Running pass: SimplifyCFGPass on foo ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass diff --git a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-58759.ll b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-58759.ll index 7c390dadef777..5cbfaade98d3c 100644 --- a/llvm/test/Transforms/FunctionSpecialization/compiler-crash-58759.ll +++ b/llvm/test/Transforms/FunctionSpecialization/compiler-crash-58759.ll @@ -1,4 +1,4 @@ -; RUN: opt -S --passes="default" -force-specialization < %s | FileCheck %s +; RUN: opt -S --passes="default" < %s | FileCheck %s define dso_local i32 @g0(i32 noundef %x) local_unnamed_addr { entry: diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll index b5a0084ed52e6..b6e726ea46af0 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-always-inline.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s +; RUN: opt -passes="ipsccp" -funcspec-avg-loop-iters=3 -funcspec-min-function-size=10 -S < %s | FileCheck %s ; CHECK-NOT: foo.{{[0-9]+}} diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll index ef73ed63b863b..003f80fa260ff 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-expression.ll @@ -1,9 +1,11 @@ -; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; Test function specialization 
wouldn't crash due to constant expression. ; Note that this test case shows that function specialization pass would ; transform the function even if no specialization happened. +; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s + %struct = type { i8, i16, i32, i64, i64} @Global = internal constant %struct {i8 0, i16 1, i32 2, i64 3, i64 4} @@ -24,6 +26,19 @@ entry: } define internal i64 @zoo(i1 %flag) { +; CHECK-LABEL: @zoo( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]] +; CHECK: plus: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @func2.2(ptr getelementptr inbounds ([[STRUCT:%.*]], ptr @Global, i32 0, i32 3)) +; CHECK-NEXT: br label [[MERGE:%.*]] +; CHECK: minus: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @func2.1(ptr getelementptr inbounds ([[STRUCT]], ptr @Global, i32 0, i32 4)) +; CHECK-NEXT: br label [[MERGE]] +; CHECK: merge: +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ ptrtoint (ptr getelementptr inbounds ([[STRUCT:%.*]], ptr @Global, i32 0, i32 3) to i64), [[PLUS]] ], [ ptrtoint (ptr getelementptr inbounds ([[STRUCT:%.*]], ptr @Global, i32 0, i32 4) to i64), [[MINUS]] ] +; CHECK-NEXT: ret i64 [[TMP2]] +; entry: br i1 %flag, label %plus, label %minus @@ -45,9 +60,10 @@ merge: define i64 @main() { ; CHECK-LABEL: @main( -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @zoo.4(i1 false) -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @zoo.3(i1 true) -; CHECK-NEXT: ret i64 add (i64 ptrtoint (ptr getelementptr inbounds ([[STRUCT:%.*]], ptr @Global, i32 0, i32 4) to i64), i64 ptrtoint (ptr getelementptr inbounds ([[STRUCT]], ptr @Global, i32 0, i32 3) to i64)) +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @zoo(i1 false) +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @zoo(i1 true) +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i64 [[TMP3]] ; %1 = call i64 @zoo(i1 0) %2 = call i64 @zoo(i1 1) @@ -55,29 +71,3 @@ define i64 @main() { ret i64 %3 } -; CHECK-LABEL: @func2.1( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i64 undef - -; CHECK-LABEL: @func2.2( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i64 undef - -; CHECK-LABEL: @zoo.3( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[PLUS:%.*]] -; CHECK: plus: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @func2.2(ptr getelementptr inbounds ([[STRUCT:%.*]], ptr @Global, i32 0, i32 3)) -; CHECK-NEXT: br label [[MERGE:%.*]] -; CHECK: merge: -; CHECK-NEXT: ret i64 undef - -; CHECK-LABEL: @zoo.4( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[MINUS:%.*]] -; CHECK: minus: -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @func2.1(ptr getelementptr inbounds ([[STRUCT:%.*]], ptr @Global, i32 0, i32 4)) -; CHECK-NEXT: br label [[MERGE:%.*]] -; CHECK: merge: -; CHECK-NEXT: ret i64 undef - diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll index 976a326a4a886..1b00e1032bd43 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-constant-integers.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes="ipsccp" -funcspec-for-literal-constant=true -force-specialization -S < %s | FileCheck %s +; RUN: opt -passes="ipsccp" -funcspec-for-literal-constant=true -funcspec-min-function-size=10 -S < %s | FileCheck %s ; Check that the literal constant parameter could be specialized. 
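
The updated RUN line above depends on two option changes made earlier in this patch: -funcspec-for-literal-constant is now off by default, so the test passes it explicitly, and -funcspec-min-function-size=10 takes the place of -force-specialization, presumably so the small test function still clears the size threshold. The cl::opt declarations for these options lost their template parameters in the quoted FunctionSpecialization.cpp hunk; reconstructed, the two most relevant ones read roughly as follows, with the exact template arguments being an educated guess.

// Reconstruction of two cl::opt declarations from the
// FunctionSpecialization.cpp hunk above; the <unsigned>/<bool> template
// arguments were dropped in the quoted diff and are filled in here as an
// educated guess rather than a verbatim quote.
static cl::opt<unsigned> AvgLoopIters(
    "funcspec-avg-loop-iters", cl::init(10), cl::Hidden,
    cl::desc("Average loop iteration count"));

// Disabled by default as it can significantly increase compilation times.
static cl::opt<bool> SpecializeLiteralConstant(
    "funcspec-for-literal-constant", cl::init(false), cl::Hidden,
    cl::desc("Enable specialization of functions that take a literal constant "
             "as an argument"));
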
; CHECK: @foo.1( diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll new file mode 100644 index 0000000000000..0e6fe4ce19872 --- /dev/null +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-loop.ll @@ -0,0 +1,63 @@ +; RUN: opt -passes="ipsccp" -funcspec-avg-loop-iters=5 -funcspec-min-function-size=10 -S < %s | FileCheck %s + +; Check that the loop depth results in a larger specialization bonus. +; CHECK: @foo.1( +; CHECK: @foo.2( + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +@A = external dso_local constant i32, align 4 +@B = external dso_local constant i32, align 4 +@C = external dso_local constant i32, align 4 +@D = external dso_local constant i32, align 4 + +declare i1 @cond_begin() +declare i1 @cond_end() +declare i1 @getCond() + +define internal i32 @foo(i32 %x, ptr %b, ptr %c) { +entry: + br label %loop.entry + +loop.entry: + br label %loop2.entry + +loop2.entry: + br label %loop2.body + +loop2.body: + %0 = load i32, ptr %b, align 4 + %1 = load i32, ptr %c, align 4 + %add.0 = add nsw i32 %0, %1 + %add = add nsw i32 %add.0, %x + br label %loop2.end + +loop2.end: + %cond.end = call i1 @cond_end() + br i1 %cond.end, label %loop2.entry, label %loop.end + +loop.end: + %cond2.end = call i1 @getCond() + br i1 %cond2.end, label %loop.entry, label %return + +return: + ret i32 %add +} + +define dso_local i32 @bar(i32 %x, i32 %y) { +entry: + %tobool = icmp ne i32 %x, 0 + br i1 %tobool, label %if.then, label %if.else + +if.then: + %call = call i32 @foo(i32 %x, ptr @A, ptr @C) + br label %return + +if.else: + %call1 = call i32 @foo(i32 %y, ptr @B, ptr @D) + br label %return + +return: + %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ] + ret i32 %retval.0 +} diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll index 609058764262b..525721f03cfb2 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization-minsize3.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s +; RUN: opt -passes="ipsccp" -funcspec-min-function-size=3 -S < %s | FileCheck %s ; Checks for callsites that have been annotated with MinSize. 
We only expect ; specialisation for the call that does not have the attribute: diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll index 21be617fd5c3b..b5d16f6dab1c0 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization.ll @@ -1,5 +1,5 @@ -; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s -; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s --check-prefix=NOFSPEC +; RUN: opt -passes="ipsccp" -funcspec-min-function-size=3 -S < %s | FileCheck %s +; RUN: opt -passes="ipsccp" -funcspec-min-function-size=3 -S < %s | FileCheck %s --check-prefix=NOFSPEC define i64 @main(i64 %x, i1 %flag) { ; diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll new file mode 100644 index 0000000000000..c7ef3e8ef520a --- /dev/null +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization2.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="ipsccp,deadargelim" -force-specialization -S < %s | FileCheck %s +; RUN: opt -passes="ipsccp,deadargelim" -funcspec-max-iters=1 -force-specialization -S < %s | FileCheck %s +; RUN: opt -passes="ipsccp,deadargelim" -funcspec-max-iters=0 -force-specialization -S < %s | FileCheck %s --check-prefix=DISABLED +; RUN: opt -passes="ipsccp,deadargelim" -funcspec-avg-loop-iters=1 -force-specialization -S < %s | FileCheck %s + +; DISABLED-NOT: @func.1( +; DISABLED-NOT: @func.2( + +define internal i32 @func(ptr %0, i32 %1, ptr nocapture %2) { + %4 = alloca i32, align 4 + store i32 %1, ptr %4, align 4 + %5 = load i32, ptr %4, align 4 + %6 = icmp slt i32 %5, 1 + br i1 %6, label %14, label %7 + +7: ; preds = %3 + %8 = load i32, ptr %4, align 4 + %9 = sext i32 %8 to i64 + %10 = getelementptr inbounds i32, ptr %0, i64 %9 + call void %2(ptr %10) + %11 = load i32, ptr %4, align 4 + %12 = add nsw i32 %11, -1 + %13 = call i32 @func(ptr %0, i32 %12, ptr %2) + br label %14 + +14: ; preds = %3, %7 + ret i32 0 +} + +define internal void @increment(ptr nocapture %0) { + %2 = load i32, ptr %0, align 4 + %3 = add nsw i32 %2, 1 + store i32 %3, ptr %0, align 4 + ret void +} + +define internal void @decrement(ptr nocapture %0) { + %2 = load i32, ptr %0, align 4 + %3 = add nsw i32 %2, -1 + store i32 %3, ptr %0, align 4 + ret void +} + +define i32 @main(ptr %0, i32 %1) { +; CHECK: call void @func.2(ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) + %3 = call i32 @func(ptr %0, i32 %1, ptr nonnull @increment) +; CHECK: call void @func.1(ptr [[TMP0]], i32 0) + %4 = call i32 @func(ptr %0, i32 %3, ptr nonnull @decrement) +; CHECK: ret i32 0 + ret i32 %4 +} + +; CHECK: @func.1( +; CHECK: [[TMP3:%.*]] = alloca i32, align 4 +; CHECK: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 +; CHECK: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK: [[TMP5:%.*]] = icmp slt i32 [[TMP4]], 1 +; CHECK: br i1 [[TMP5]], label [[TMP13:%.*]], label [[TMP6:%.*]] +; CHECK: 6: +; CHECK: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 [[TMP8]] +; CHECK: call void @decrement(ptr [[TMP9]]) +; CHECK: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 
+; CHECK: call void @func.1(ptr [[TMP0]], i32 [[TMP11]]) +; CHECK: br label [[TMP12:%.*]] +; CHECK: 12: +; CHECK: ret void +; +; +; CHECK: @func.2( +; CHECK: [[TMP3:%.*]] = alloca i32, align 4 +; CHECK: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 +; CHECK: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK: [[TMP5:%.*]] = icmp slt i32 [[TMP4]], 1 +; CHECK: br i1 [[TMP5]], label [[TMP13:%.*]], label [[TMP6:%.*]] +; CHECK: 6: +; CHECK: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 [[TMP8]] +; CHECK: call void @increment(ptr [[TMP9]]) +; CHECK: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 +; CHECK: call void @func.2(ptr [[TMP0]], i32 [[TMP11]]) +; CHECK: br label [[TMP12:%.*]] +; CHECK: 12: +; CHECK: ret void diff --git a/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll b/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll index d80b6dfcf18aa..83e6b6660dc09 100644 --- a/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll +++ b/llvm/test/Transforms/FunctionSpecialization/function-specialization3.ll @@ -1,7 +1,9 @@ -; RUN: opt -passes="ipsccp" -S < %s | \ +; RUN: opt -passes="ipsccp" -funcspec-avg-loop-iters=3 -S < %s | \ ; RUN: FileCheck %s --check-prefixes=COMMON,DISABLED ; RUN: opt -passes="ipsccp" -force-specialization -S < %s | \ ; RUN: FileCheck %s --check-prefixes=COMMON,FORCE +; RUN: opt -passes="ipsccp" -funcspec-avg-loop-iters=3 -force-specialization -S < %s | \ +; RUN: FileCheck %s --check-prefixes=COMMON,FORCE ; Test for specializing a constant global. diff --git a/llvm/test/Transforms/FunctionSpecialization/get-possible-constants.ll b/llvm/test/Transforms/FunctionSpecialization/get-possible-constants.ll index 84231b1cae6e4..9b14db5399f3d 100644 --- a/llvm/test/Transforms/FunctionSpecialization/get-possible-constants.ll +++ b/llvm/test/Transforms/FunctionSpecialization/get-possible-constants.ll @@ -1,4 +1,4 @@ -; RUN: opt -S --passes="ipsccp" -force-specialization < %s | FileCheck %s +; RUN: opt -S --passes="ipsccp" < %s | FileCheck %s define dso_local i32 @p0(i32 noundef %x) { entry: %add = add nsw i32 %x, 1 diff --git a/llvm/test/Transforms/FunctionSpecialization/global-rank.ll b/llvm/test/Transforms/FunctionSpecialization/global-rank.ll index d46b73d156894..541faa2e19515 100644 --- a/llvm/test/Transforms/FunctionSpecialization/global-rank.ll +++ b/llvm/test/Transforms/FunctionSpecialization/global-rank.ll @@ -1,5 +1,4 @@ -; RUN: opt -S --passes="ipsccp" -funcspec-max-clones=1 -force-specialization < %s | FileCheck %s - +; RUN: opt -S --passes="ipsccp" -funcspec-max-clones=1 < %s | FileCheck %s define internal i32 @f(i32 noundef %x, ptr nocapture noundef readonly %p, ptr nocapture noundef readonly %q) noinline { entry: %call = tail call i32 %p(i32 noundef %x) diff --git a/llvm/test/Transforms/FunctionSpecialization/identical-specializations.ll b/llvm/test/Transforms/FunctionSpecialization/identical-specializations.ll index 2cfbf9dd7bdaa..c2ba0920c2be3 100644 --- a/llvm/test/Transforms/FunctionSpecialization/identical-specializations.ll +++ b/llvm/test/Transforms/FunctionSpecialization/identical-specializations.ll @@ -6,10 +6,10 @@ define i64 @main(i64 %x, i64 %y, i1 %flag) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[FLAG:%.*]], label [[PLUS:%.*]], label [[MINUS:%.*]] ; CHECK: plus: -; CHECK-NEXT: [[CMP0:%.*]] = call i64 
@compute.2(i64 [[X:%.*]], i64 42, ptr @plus, ptr @minus) +; CHECK-NEXT: [[CMP0:%.*]] = call i64 @compute.2(i64 [[X:%.*]], i64 [[Y:%.*]], ptr @plus, ptr @minus) ; CHECK-NEXT: br label [[MERGE:%.*]] ; CHECK: minus: -; CHECK-NEXT: [[CMP1:%.*]] = call i64 @compute.3(i64 [[X]], i64 [[Y:%.*]], ptr @minus, ptr @plus) +; CHECK-NEXT: [[CMP1:%.*]] = call i64 @compute.3(i64 [[X]], i64 [[Y]], ptr @minus, ptr @plus) ; CHECK-NEXT: br label [[MERGE]] ; CHECK: merge: ; CHECK-NEXT: [[PH:%.*]] = phi i64 [ [[CMP0]], [[PLUS]] ], [ [[CMP1]], [[MINUS]] ] @@ -20,7 +20,7 @@ entry: br i1 %flag, label %plus, label %minus plus: - %cmp0 = call i64 @compute(i64 %x, i64 42, ptr @plus, ptr @minus) + %cmp0 = call i64 @compute(i64 %x, i64 %y, ptr @plus, ptr @minus) br label %merge minus: @@ -68,9 +68,9 @@ entry: ; CHECK-LABEL: @compute.2 ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP0:%.*]] = call i64 @plus(i64 [[X:%.*]], i64 42) -; CHECK-NEXT: [[CMP1:%.*]] = call i64 @minus(i64 [[X]], i64 42) -; CHECK-NEXT: [[CMP2:%.*]] = call i64 @compute.1(i64 [[X]], i64 42, ptr @plus, ptr @plus) +; CHECK-NEXT: [[CMP0:%.*]] = call i64 @plus(i64 [[X:%.*]], i64 [[Y:%.*]]) +; CHECK-NEXT: [[CMP1:%.*]] = call i64 @minus(i64 [[X]], i64 [[Y]]) +; CHECK-NEXT: [[CMP2:%.*]] = call i64 @compute.1(i64 [[X]], i64 [[Y]], ptr @plus, ptr @plus) ; CHECK-LABEL: @compute.3 ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/FunctionSpecialization/literal-const.ll b/llvm/test/Transforms/FunctionSpecialization/literal-const.ll index 479a841567ad7..fc400202ab91e 100644 --- a/llvm/test/Transforms/FunctionSpecialization/literal-const.ll +++ b/llvm/test/Transforms/FunctionSpecialization/literal-const.ll @@ -1,8 +1,7 @@ ; RUN: opt -S --passes="ipsccp" \ -; RUN: -funcspec-for-literal-constant=0 \ ; RUN: -force-specialization < %s | FileCheck %s -check-prefix CHECK-NOLIT ; RUN: opt -S --passes="ipsccp" \ -; RUN: -funcspec-for-literal-constant=1 \ +; RUN: -funcspec-for-literal-constant \ ; RUN: -force-specialization < %s | FileCheck %s -check-prefix CHECK-LIT define i32 @f0(i32 noundef %x) { diff --git a/llvm/test/Transforms/FunctionSpecialization/max-iters.ll b/llvm/test/Transforms/FunctionSpecialization/max-iters.ll deleted file mode 100644 index 76d60949f1ade..0000000000000 --- a/llvm/test/Transforms/FunctionSpecialization/max-iters.ll +++ /dev/null @@ -1,110 +0,0 @@ -; RUN: opt -passes="ipsccp,deadargelim" -force-specialization -S < %s | FileCheck %s --check-prefixes=COMMON,ITERS1 -; RUN: opt -passes="ipsccp,deadargelim" -funcspec-max-iters=1 -force-specialization -S < %s | FileCheck %s --check-prefixes=COMMON,ITERS1 -; RUN: opt -passes="ipsccp,deadargelim" -funcspec-max-iters=2 -force-specialization -S < %s | FileCheck %s --check-prefixes=COMMON,ITERS2 -; RUN: opt -passes="ipsccp,deadargelim" -funcspec-max-iters=0 -force-specialization -S < %s | FileCheck %s --check-prefix=DISABLED - -; DISABLED-NOT: @func.1( -; DISABLED-NOT: @func.2( -; DISABLED-NOT: @func.3( - -define internal i32 @func(ptr %0, i32 %1, ptr nocapture %2) { - %4 = alloca i32, align 4 - store i32 %1, ptr %4, align 4 - %5 = load i32, ptr %4, align 4 - %6 = icmp slt i32 %5, 1 - br i1 %6, label %14, label %7 - -7: ; preds = %3 - %8 = load i32, ptr %4, align 4 - %9 = sext i32 %8 to i64 - %10 = getelementptr inbounds i32, ptr %0, i64 %9 - call void %2(ptr %10) - %11 = load i32, ptr %4, align 4 - %12 = add nsw i32 %11, -1 - %13 = call i32 @func(ptr %0, i32 %12, ptr %2) - br label %14 - -14: ; preds = %3, %7 - ret i32 0 -} - -define internal void @increment(ptr nocapture %0) { - %2 = load i32, ptr %0, 
align 4 - %3 = add nsw i32 %2, 1 - store i32 %3, ptr %0, align 4 - ret void -} - -define internal void @decrement(ptr nocapture %0) { - %2 = load i32, ptr %0, align 4 - %3 = add nsw i32 %2, -1 - store i32 %3, ptr %0, align 4 - ret void -} - -define i32 @main(ptr %0, i32 %1) { -; COMMON: define i32 @main( -; COMMON-NEXT: call void @func.2(ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) -; COMMON-NEXT: call void @func.1(ptr [[TMP0]]) -; COMMON-NEXT: ret i32 0 -; - %3 = call i32 @func(ptr %0, i32 %1, ptr nonnull @increment) - %4 = call i32 @func(ptr %0, i32 %3, ptr nonnull @decrement) - ret i32 %4 -} - -; COMMON: define internal void @func.1( -; COMMON-NEXT: [[TMP2:%.*]] = alloca i32, align 4 -; COMMON-NEXT: store i32 0, ptr [[TMP2]], align 4 -; COMMON-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 -; COMMON-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP3]], 1 -; COMMON-NEXT: br i1 [[TMP4]], label [[TMP11:%.*]], label [[TMP5:%.*]] -; COMMON: 5: -; COMMON-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4 -; COMMON-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 -; COMMON-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 [[TMP7]] -; COMMON-NEXT: call void @decrement(ptr [[TMP8]]) -; COMMON-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP2]], align 4 -; COMMON-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1 -; ITERS1-NEXT: call void @func(ptr [[TMP0]], i32 [[TMP10]], ptr @decrement) -; ITERS2-NEXT: call void @func.3(ptr [[TMP0]], i32 [[TMP10]]) -; COMMON-NEXT: br label [[TMP11:%.*]] -; COMMON: 11: -; COMMON-NEXT: ret void -; -; COMMON: define internal void @func.2( -; COMMON-NEXT: [[TMP3:%.*]] = alloca i32, align 4 -; COMMON-NEXT: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 -; COMMON-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 -; COMMON-NEXT: [[TMP5:%.*]] = icmp slt i32 [[TMP4]], 1 -; COMMON-NEXT: br i1 [[TMP5]], label [[TMP13:%.*]], label [[TMP6:%.*]] -; COMMON: 6: -; COMMON-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4 -; COMMON-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 -; COMMON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 [[TMP8]] -; COMMON-NEXT: call void @increment(ptr [[TMP9]]) -; COMMON-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 -; COMMON-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 -; COMMON-NEXT: call void @func.2(ptr [[TMP0]], i32 [[TMP11]]) -; COMMON-NEXT: br label [[TMP12:%.*]] -; COMMON: 12: -; COMMON-NEXT: ret void -; -; ITERS2: define internal void @func.3( -; ITERS2-NEXT: [[TMP3:%.*]] = alloca i32, align 4 -; ITERS2-NEXT: store i32 [[TMP1:%.*]], ptr [[TMP3]], align 4 -; ITERS2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 -; ITERS2-NEXT: [[TMP5:%.*]] = icmp slt i32 [[TMP4]], 1 -; ITERS2-NEXT: br i1 [[TMP5]], label [[TMP13:%.*]], label [[TMP6:%.*]] -; ITERS2: 6: -; ITERS2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP3]], align 4 -; ITERS2-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 -; ITERS2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 [[TMP8]] -; ITERS2-NEXT: call void @decrement(ptr [[TMP9]]) -; ITERS2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4 -; ITERS2-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1 -; ITERS2-NEXT: call void @func.3(ptr [[TMP0]], i32 [[TMP11]]) -; ITERS2-NEXT: br label [[TMP12:%.*]] -; ITERS2: 12: -; ITERS2-NEXT: ret void - diff --git a/llvm/test/Transforms/FunctionSpecialization/noinline.ll b/llvm/test/Transforms/FunctionSpecialization/noinline.ll index bf66cf374c488..863e6e74eb23c 100644 --- a/llvm/test/Transforms/FunctionSpecialization/noinline.ll +++ 
b/llvm/test/Transforms/FunctionSpecialization/noinline.ll @@ -1,4 +1,4 @@ -; RUN: opt -S --passes="ipsccp" -funcspec-min-entry-freq=1 < %s | FileCheck %s +; RUN: opt -S --passes="ipsccp" < %s | FileCheck %s define dso_local i32 @p0(i32 noundef %x) { entry: %add = add nsw i32 %x, 1 diff --git a/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll b/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll index 3db1a8ce69a10..4233998ad9f6d 100644 --- a/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll +++ b/llvm/test/Transforms/FunctionSpecialization/remove-dead-recursive-function.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s +; RUN: opt -passes="ipsccp" -funcspec-min-function-size=3 -S < %s | FileCheck %s define i64 @main(i64 %x, i1 %flag) { entry: diff --git a/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll b/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll index 73006ae0fcb58..d1c23e07d5972 100644 --- a/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll +++ b/llvm/test/Transforms/FunctionSpecialization/specialize-multiple-arguments.ll @@ -1,12 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes="ipsccp" -funcspec-max-clones=0 -force-specialization -S < %s | FileCheck %s --check-prefix=NONE -; RUN: opt -passes="ipsccp" -funcspec-max-clones=1 -force-specialization -S < %s | FileCheck %s --check-prefix=ONE -; RUN: opt -passes="ipsccp" -funcspec-max-clones=2 -force-specialization -S < %s | FileCheck %s --check-prefix=TWO -; RUN: opt -passes="ipsccp" -funcspec-max-clones=3 -force-specialization -S < %s | FileCheck %s --check-prefix=THREE +; RUN: opt -passes="ipsccp" -funcspec-max-clones=0 -funcspec-min-function-size=14 -S < %s | FileCheck %s --check-prefix=NONE +; RUN: opt -passes="ipsccp" -funcspec-max-clones=1 -funcspec-min-function-size=14 -S < %s | FileCheck %s --check-prefix=ONE +; RUN: opt -passes="ipsccp" -funcspec-max-clones=2 -funcspec-min-function-size=14 -S < %s | FileCheck %s --check-prefix=TWO +; RUN: opt -passes="ipsccp" -funcspec-max-clones=3 -funcspec-min-function-size=14 -S < %s | FileCheck %s --check-prefix=THREE ; Make sure that we iterate correctly after sorting the specializations: -; -; Score(@plus, @minus) > Score(42, @minus, @power) > Score(@power, @mul) +; FnSpecialization: Specializations for function compute +; FnSpecialization: Gain = 608 +; FnSpecialization: FormalArg = binop1, ActualArg = power +; FnSpecialization: FormalArg = binop2, ActualArg = mul +; FnSpecialization: Gain = 982 +; FnSpecialization: FormalArg = binop1, ActualArg = plus +; FnSpecialization: FormalArg = binop2, ActualArg = minus +; FnSpecialization: Gain = 795 +; FnSpecialization: FormalArg = binop1, ActualArg = minus +; FnSpecialization: FormalArg = binop2, ActualArg = power define i64 @main(i64 %x, i64 %y, i1 %flag) { ; NONE-LABEL: @main( @@ -108,11 +116,11 @@ merge: ; ; THREE-LABEL: define internal i64 @compute.3(i64 %x, i64 %y, ptr %binop1, ptr %binop2) { ; THREE-NEXT: entry: -; THREE-NEXT: [[TMP0:%.+]] = call i64 @minus(i64 %x, i64 42) -; THREE-NEXT: [[TMP1:%.+]] = call i64 @power(i64 %x, i64 42) +; THREE-NEXT: [[TMP0:%.+]] = call i64 @minus(i64 %x, i64 %y) +; THREE-NEXT: [[TMP1:%.+]] = call i64 @power(i64 %x, i64 %y) ; THREE-NEXT: [[TMP2:%.+]] = add i64 [[TMP0]], [[TMP1]] ; THREE-NEXT: [[TMP3:%.+]] = sdiv i64 [[TMP2]], %x -; THREE-NEXT: 
[[TMP4:%.+]] = sub i64 [[TMP3]], 42 +; THREE-NEXT: [[TMP4:%.+]] = sub i64 [[TMP3]], %y ; THREE-NEXT: [[TMP5:%.+]] = mul i64 [[TMP4]], 2 ; THREE-NEXT: ret i64 [[TMP5]] ; THREE-NEXT: } diff --git a/llvm/unittests/Transforms/IPO/CMakeLists.txt b/llvm/unittests/Transforms/IPO/CMakeLists.txt index 4e4372179b46c..3b16d81ae3b29 100644 --- a/llvm/unittests/Transforms/IPO/CMakeLists.txt +++ b/llvm/unittests/Transforms/IPO/CMakeLists.txt @@ -12,7 +12,6 @@ add_llvm_unittest(IPOTests LowerTypeTests.cpp WholeProgramDevirt.cpp AttributorTest.cpp - FunctionSpecializationTest.cpp ) set_property(TARGET IPOTests PROPERTY FOLDER "Tests/UnitTests/TransformsTests") diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp deleted file mode 100644 index c6516bbe58051..0000000000000 --- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp +++ /dev/null @@ -1,261 +0,0 @@ -//===- FunctionSpecializationTest.cpp - Cost model unit tests -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/AsmParser/Parser.h" -#include "llvm/IR/Constants.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Transforms/IPO/FunctionSpecialization.h" -#include "llvm/Transforms/Utils/SCCPSolver.h" -#include "gtest/gtest.h" -#include - -namespace llvm { - -class FunctionSpecializationTest : public testing::Test { -protected: - LLVMContext Ctx; - FunctionAnalysisManager FAM; - std::unique_ptr M; - std::unique_ptr Solver; - - FunctionSpecializationTest() { - FAM.registerPass([&] { return TargetLibraryAnalysis(); }); - FAM.registerPass([&] { return TargetIRAnalysis(); }); - FAM.registerPass([&] { return BlockFrequencyAnalysis(); }); - FAM.registerPass([&] { return BranchProbabilityAnalysis(); }); - FAM.registerPass([&] { return LoopAnalysis(); }); - FAM.registerPass([&] { return AssumptionAnalysis(); }); - FAM.registerPass([&] { return DominatorTreeAnalysis(); }); - FAM.registerPass([&] { return PostDominatorTreeAnalysis(); }); - FAM.registerPass([&] { return PassInstrumentationAnalysis(); }); - } - - Module &parseModule(const char *ModuleString) { - SMDiagnostic Err; - M = parseAssemblyString(ModuleString, Err, Ctx); - EXPECT_TRUE(M); - return *M; - } - - FunctionSpecializer getSpecializerFor(Function *F) { - auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & { - return FAM.getResult(F); - }; - auto GetTTI = [this](Function &F) -> TargetTransformInfo & { - return FAM.getResult(F); - }; - auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & { - return FAM.getResult(F); - }; - auto GetAC = [this](Function &F) -> AssumptionCache & { - return FAM.getResult(F); - }; - auto GetAnalysis = [this](Function &F) -> AnalysisResultsForFn { - DominatorTree &DT = FAM.getResult(F); - return { std::make_unique(F, DT, - FAM.getResult(F)), - &DT, FAM.getCachedResult(F) }; - }; - - Solver = std::make_unique(M->getDataLayout(), GetTLI, Ctx); - - Solver->addAnalysis(*F, 
GetAnalysis(*F)); - Solver->markBlockExecutable(&F->front()); - for (Argument &Arg : F->args()) - Solver->markOverdefined(&Arg); - Solver->solveWhileResolvedUndefsIn(*M); - - return FunctionSpecializer(*Solver, *M, &FAM, GetBFI, GetTLI, GetTTI, - GetAC); - } - - Cost getInstCost(Instruction &I) { - auto &TTI = FAM.getResult(*I.getFunction()); - auto &BFI = FAM.getResult(*I.getFunction()); - - uint64_t Weight = FunctionSpecializer::getBlockFreqMultiplier() * - BFI.getBlockFreq(I.getParent()).getFrequency() / - BFI.getEntryFreq(); - return Weight * - TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency); - } -}; - -} // namespace llvm - -using namespace llvm; - -TEST_F(FunctionSpecializationTest, SwitchInst) { - const char *ModuleString = R"( - define void @foo(i32 %a, i32 %b, i32 %i) { - entry: - switch i32 %i, label %default - [ i32 1, label %case1 - i32 2, label %case2 ] - case1: - %0 = mul i32 %a, 2 - %1 = sub i32 6, 5 - br label %bb1 - case2: - %2 = and i32 %b, 3 - %3 = sdiv i32 8, 2 - br label %bb2 - bb1: - %4 = add i32 %0, %b - br label %default - bb2: - %5 = or i32 %2, %a - br label %default - default: - ret void - } - )"; - - Module &M = parseModule(ModuleString); - Function *F = M.getFunction("foo"); - FunctionSpecializer Specializer = getSpecializerFor(F); - InstCostVisitor Visitor = Specializer.getInstCostVisitorFor(F); - - Constant *One = ConstantInt::get(IntegerType::getInt32Ty(M.getContext()), 1); - - auto FuncIter = F->begin(); - BasicBlock &Case1 = *++FuncIter; - BasicBlock &Case2 = *++FuncIter; - BasicBlock &BB1 = *++FuncIter; - BasicBlock &BB2 = *++FuncIter; - - Instruction &Mul = Case1.front(); - Instruction &And = Case2.front(); - Instruction &Sdiv = *++Case2.begin(); - Instruction &BrBB2 = Case2.back(); - Instruction &Add = BB1.front(); - Instruction &Or = BB2.front(); - Instruction &BrDefault = BB2.back(); - - // mul - Cost Ref = getInstCost(Mul); - Cost Bonus = Specializer.getSpecializationBonus(F->getArg(0), One, Visitor); - EXPECT_EQ(Bonus, Ref); - - // and + or + add - Ref = getInstCost(And) + getInstCost(Or) + getInstCost(Add); - Bonus = Specializer.getSpecializationBonus(F->getArg(1), One, Visitor); - EXPECT_EQ(Bonus, Ref); - - // sdiv + br + br - Ref = getInstCost(Sdiv) + getInstCost(BrBB2) + getInstCost(BrDefault); - Bonus = Specializer.getSpecializationBonus(F->getArg(2), One, Visitor); - EXPECT_EQ(Bonus, Ref); -} - -TEST_F(FunctionSpecializationTest, BranchInst) { - const char *ModuleString = R"( - define void @foo(i32 %a, i32 %b, i1 %cond) { - entry: - br i1 %cond, label %bb0, label %bb2 - bb0: - %0 = mul i32 %a, 2 - %1 = sub i32 6, 5 - br label %bb1 - bb1: - %2 = add i32 %0, %b - %3 = sdiv i32 8, 2 - br label %bb2 - bb2: - ret void - } - )"; - - Module &M = parseModule(ModuleString); - Function *F = M.getFunction("foo"); - FunctionSpecializer Specializer = getSpecializerFor(F); - InstCostVisitor Visitor = Specializer.getInstCostVisitorFor(F); - - Constant *One = ConstantInt::get(IntegerType::getInt32Ty(M.getContext()), 1); - Constant *False = ConstantInt::getFalse(M.getContext()); - - auto FuncIter = F->begin(); - BasicBlock &BB0 = *++FuncIter; - BasicBlock &BB1 = *++FuncIter; - - Instruction &Mul = BB0.front(); - Instruction &Sub = *++BB0.begin(); - Instruction &BrBB1 = BB0.back(); - Instruction &Add = BB1.front(); - Instruction &Sdiv = *++BB1.begin(); - Instruction &BrBB2 = BB1.back(); - - // mul - Cost Ref = getInstCost(Mul); - Cost Bonus = Specializer.getSpecializationBonus(F->getArg(0), One, Visitor); - EXPECT_EQ(Bonus, Ref); - - // add - 
Ref = getInstCost(Add); - Bonus = Specializer.getSpecializationBonus(F->getArg(1), One, Visitor); - EXPECT_EQ(Bonus, Ref); - - // sub + br + sdiv + br - Ref = getInstCost(Sub) + getInstCost(BrBB1) + getInstCost(Sdiv) + - getInstCost(BrBB2); - Bonus = Specializer.getSpecializationBonus(F->getArg(2), False, Visitor); - EXPECT_EQ(Bonus, Ref); -} - -TEST_F(FunctionSpecializationTest, Misc) { - const char *ModuleString = R"( - @g = constant [2 x i32] zeroinitializer, align 4 - - define i32 @foo(i8 %a, i1 %cond, ptr %b) { - %cmp = icmp eq i8 %a, 10 - %ext = zext i1 %cmp to i32 - %sel = select i1 %cond, i32 %ext, i32 1 - %gep = getelementptr i32, ptr %b, i32 %sel - %ld = load i32, ptr %gep - ret i32 %ld - } - )"; - - Module &M = parseModule(ModuleString); - Function *F = M.getFunction("foo"); - FunctionSpecializer Specializer = getSpecializerFor(F); - InstCostVisitor Visitor = Specializer.getInstCostVisitorFor(F); - - GlobalVariable *GV = M.getGlobalVariable("g"); - Constant *One = ConstantInt::get(IntegerType::getInt8Ty(M.getContext()), 1); - Constant *True = ConstantInt::getTrue(M.getContext()); - - auto BlockIter = F->front().begin(); - Instruction &Icmp = *BlockIter++; - Instruction &Zext = *BlockIter++; - Instruction &Select = *BlockIter++; - Instruction &Gep = *BlockIter++; - Instruction &Load = *BlockIter++; - - // icmp + zext - Cost Ref = getInstCost(Icmp) + getInstCost(Zext); - Cost Bonus = Specializer.getSpecializationBonus(F->getArg(0), One, Visitor); - EXPECT_EQ(Bonus, Ref); - - // select - Ref = getInstCost(Select); - Bonus = Specializer.getSpecializationBonus(F->getArg(1), True, Visitor); - EXPECT_EQ(Bonus, Ref); - - // gep + load - Ref = getInstCost(Gep) + getInstCost(Load); - Bonus = Specializer.getSpecializationBonus(F->getArg(2), GV, Visitor); - EXPECT_EQ(Bonus, Ref); -} From c8319cf95859935f28f53e80297c7ee62f36d8ac Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 30 May 2023 12:49:12 +0000 Subject: [PATCH 075/704] [gn build] Port 96a14f388b1a --- llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn index f5b162dd10232..3b5c5842dd5bf 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Transforms/IPO/BUILD.gn @@ -9,7 +9,6 @@ unittest("IPOTests") { ] sources = [ "AttributorTest.cpp", - "FunctionSpecializationTest.cpp", "LowerTypeTests.cpp", "WholeProgramDevirt.cpp", ] From 9ec52275acd6120db9a33d4f97d28848166cf839 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Tue, 30 May 2023 15:12:54 +0200 Subject: [PATCH 076/704] [mlir][linalg] FuseIntoContainingOp: Always set newContainingOp All result handles must be set in case of success. 
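
For illustration only (not part of this patch): a minimal transform-script sketch of why this matters. The op declares two result handles, and a consumer like the one below expects both to be bound even when the producer handle matches nothing; the names and matched ops here are assumed, loosely mirroring the existing regression tests.

    transform.sequence failures(propagate) {
    ^bb1(%arg1: !transform.any_op):
      %producer = transform.structured.match attributes{producer} in %arg1
        : (!transform.any_op) -> !transform.any_op
      %loop = transform.structured.match ops{["scf.forall"]} in %arg1
        : (!transform.any_op) -> !transform.any_op
      // Both results must be set on success, even if %producer is empty.
      %fused, %new_loop = transform.structured.fuse_into_containing_op %producer into %loop
        : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    }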
Differential Revision: https://reviews.llvm.org/D151705 --- .../Linalg/TransformOps/LinalgTransformOps.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index f18f24d4c3d9c..9233ce9b89bfb 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -699,11 +699,6 @@ transform::FuseIntoContainingOp::apply(transform::TransformResults &results, transform::TransformState &state) { SmallVector fusedOps; auto producerOps = state.getPayloadOps(getProducerOp()); - // If nothing to fuse, propagate success. - if (std::empty(producerOps)) { - results.set(cast(getFusedOp()), SmallVector{}); - return DiagnosedSilenceableFailure::success(); - } auto containingOps = state.getPayloadOps(getContainingOp()); if (!llvm::hasSingleElement(containingOps)) { return emitDefiniteFailure() @@ -712,6 +707,13 @@ transform::FuseIntoContainingOp::apply(transform::TransformResults &results, } Operation *containingOp = *containingOps.begin(); + // If nothing to fuse, propagate success. + if (std::empty(producerOps)) { + results.set(cast(getFusedOp()), SmallVector{}); + results.set(cast(getNewContainingOp()), {containingOp}); + return DiagnosedSilenceableFailure::success(); + } + // Helper function to find the next producer that should be fused. Take any // producer that has a use inside the containing op. SetVector remainingProducers(producerOps.begin(), From 2d731904170f1e3b378bfc556d939032e50c9a3d Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Tue, 30 May 2023 15:54:43 +0200 Subject: [PATCH 077/704] [mlir][linalg] Fix bug in FuseIntoContainingOp implementation Do not replace uses inside the body of `scf.forall` ops with results of the same op. Differential Revision: https://reviews.llvm.org/D151706 --- .../TransformOps/LinalgTransformOps.cpp | 3 +- .../transform-op-fuse-into-containing.mlir | 66 +++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 9233ce9b89bfb..a6a3fbb2e23b8 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -361,7 +361,8 @@ static Operation *replaceForAllWithNewSignature( SetVector dominatedUsers; DominanceInfo domInfo(containingOp); for (Operation *user : producerOp->getResult(resultNumber).getUsers()) { - if ((user != containingOp) && (domInfo.dominates(containingOp, user))) { + if (!containingOp->isAncestor(user) && + (domInfo.dominates(containingOp, user))) { dominatedUsers.insert(user); } } diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir index d67b4802e772a..3854cceb6273d 100644 --- a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir @@ -560,3 +560,69 @@ module { : (!transform.op<"linalg.generic">, !transform.op<"scf.forall">) -> (!transform.any_op, !transform.op<"scf.forall">) } } + +// ----- + +// This is a regression test. Make sure that the transform succeeds and valid +// IR is generated. 
+ +module { + // CHECK-LABEL: func.func @softmax_dispatch_0_generic_16x128x128_f32 + func.func @softmax_dispatch_0_generic_16x128x128_f32() -> tensor<16x128x128xf32> { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<5.000000e+00> : tensor<16x128x128xf32> + %cst_1 = arith.constant 5.000000e+00 : f32 + %1 = tensor.empty() : tensor<16x128xf32> + %2 = tensor.empty() : tensor<16x128x128xf32> + %3 = linalg.fill ins(%cst_1 : f32) outs(%1 : tensor<16x128xf32>) -> tensor<16x128xf32> + %4 = linalg.fill ins(%cst_1 : f32) outs(%1 : tensor<16x128xf32>) -> tensor<16x128xf32> + %5 = linalg.generic {producer, indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%cst : tensor<16x128x128xf32>) outs(%4 : tensor<16x128xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = arith.maxf %in, %out : f32 + linalg.yield %8 : f32 + } -> tensor<16x128xf32> + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %7 = scf.forall (%arg0, %arg1) in (16, 32) shared_outs(%arg2 = %2) -> (tensor<16x128x128xf32>) { + %11 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg1) + %extracted_slice = tensor.extract_slice %5[%arg0, %11] [1, 4] [1, 1] : tensor<16x128xf32> to tensor<1x4xf32> + %extracted_slice_3 = tensor.extract_slice %2[%arg0, %11, 0] [1, 4, 128] [1, 1, 1] : tensor<16x128x128xf32> to tensor<1x4x128xf32> + %extracted_slice_4 = tensor.extract_slice %3[%arg0, %11] [1, 4] [1, 1] : tensor<16x128xf32> to tensor<1x4xf32> + %15:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice : tensor<1x4xf32>) outs(%extracted_slice_3, %extracted_slice_4 : tensor<1x4x128xf32>, tensor<1x4xf32>) { + ^bb0(%in: f32, %out: f32, %out_9: f32): + %22 = arith.subf %cst_1, %in : f32 + %23 = math.exp %22 : f32 + %24 = arith.addf %23, %out_9 : f32 + linalg.yield %23, %24 : f32, f32 + } -> (tensor<1x4x128xf32>, tensor<1x4xf32>) + %extracted_slice_5 = tensor.extract_slice %5[%arg0, %11] [1, 4] [1, 1] : tensor<16x128xf32> to tensor<1x4xf32> + %extracted_slice_6 = tensor.extract_slice %2[%arg0, %11, 0] [1, 4, 128] [1, 1, 1] : tensor<16x128x128xf32> to tensor<1x4x128xf32> + %extracted_slice_7 = tensor.extract_slice %3[%arg0, %11] [1, 4] [1, 1] : tensor<16x128xf32> to tensor<1x4xf32> + %19:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%extracted_slice_5 : tensor<1x4xf32>) outs(%extracted_slice_6, %extracted_slice_7 : tensor<1x4x128xf32>, tensor<1x4xf32>) { + ^bb0(%in: f32, %out: f32, %out_9: f32): + %22 = arith.subf %cst_1, %in : f32 + %23 = math.exp %22 : f32 + %24 = arith.addf %23, %out_9 : f32 + linalg.yield %23, %24 : f32, f32 + } -> (tensor<1x4x128xf32>, tensor<1x4xf32>) + %extracted_slice_8 = tensor.extract_slice %arg2[%arg0, %11, 0] [1, 4, 128] [1, 1, 1] : tensor<16x128x128xf32> to tensor<1x4x128xf32> + %20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15#0, %19#1 : tensor<1x4x128xf32>, tensor<1x4xf32>) outs(%extracted_slice_8 : tensor<1x4x128xf32>) { + ^bb0(%in: f32, %in_9: f32, %out: f32): + %22 = arith.divf %in, 
%in_9 : f32 + linalg.yield %22 : f32 + } -> tensor<1x4x128xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %20 into %arg2[%arg0, %11, 0] [1, 4, 128] [1, 1, 1] : tensor<1x4x128xf32> into tensor<16x128x128xf32> + } + } + return %7 : tensor<16x128x128xf32> + } + + transform.sequence failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = transform.structured.match attributes{producer} in %arg1 : (!transform.any_op) -> !transform.op<"linalg.generic"> + %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!transform.any_op) -> !transform.op<"scf.forall"> + transform.structured.fuse_into_containing_op %0 into %1 + : (!transform.op<"linalg.generic">, !transform.op<"scf.forall">) -> (!transform.any_op, !transform.any_op) + } +} From baefd6650cfea499e940e2c2f5718d9ae694df7a Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Tue, 30 May 2023 16:00:59 +0200 Subject: [PATCH 078/704] [mlir] clarify transform.foreach_match documentation Clarify the restrictions on actions that are implied by the implementation as a post-order walk. --- mlir/include/mlir/Dialect/Transform/IR/TransformOps.td | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index a313d285492d7..62b2cd698fb80 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -150,7 +150,12 @@ def ForeachMatchOp : TransformDialectOp<"foreach_match", [ satisfies the constraints of actual types on both sides). The action symbol may not have results. The actions are expected to only modify payload operations nested in the `root` payload operations associated with the - operand of this transform operation. + operand of this transform operation. Furhermore, the actions may not modify + operations outside of the currently matched payload operation, e.g., they + may not modify sibling or parent operations. If such behavior is desired, + the parent must be matched first and the nested operations obtained by + traversing the IR from the parent. This is due to the matching being + performed as a post-order IR walk. This operation consumes the operand and produces a new handle associated with the same payload. This is necessary to trigger invalidation of handles @@ -511,7 +516,7 @@ def NamedSequenceOp : TransformDialectOp<"named_sequence", return getResAttrs().value_or(nullptr); } }]; -} +}siblings must be matched explicitly def SplitHandleOp : TransformDialectOp<"split_handle", [FunctionalStyleTransformOpTrait, From 9d6f2b19072e9b8c216350cc5764645891248b8c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 30 May 2023 15:00:49 +0100 Subject: [PATCH 079/704] [X86] Fix SunnyCove ROB/MicroOpBufferSize As raised on Issue #62602 - the IceLake scheduler model is still mainly a copy of the SkylakeServer model. This initial commit just fixes the ROB/MicroOpBufferSize to match the size reported on WikiChip/Agner, further fixes to follow in later commits. 
--- llvm/lib/Target/X86/X86SchedIceLake.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 4dfeafbca793f..ab0e2a95df722 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -19,7 +19,7 @@ def IceLakeModel : SchedMachineModel { // All x86 instructions are modeled as a single micro-op, and Ice Lake can // decode 6 instructions per cycle. let IssueWidth = 6; - let MicroOpBufferSize = 224; // Based on the reorder buffer. + let MicroOpBufferSize = 352; // Based on the reorder buffer. let LoadLatency = 5; let MispredictPenalty = 14; From 8378f1f4cdc8922e4f0409cabff25e0fef517bfa Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 30 May 2023 16:05:06 +0200 Subject: [PATCH 080/704] [InstCombine] Remove adjustMinMax() fold (PR62088) This fold is buggy if the constant adjustment overflows. Additionally, since we now canonicalize to min/max intrinsics, the constants picked here don't actually matter, as long as SPF still recognizes the pattern. Fixes https://github.com/llvm/llvm-project/issues/62088. --- .../InstCombine/InstCombineSelect.cpp | 96 +------------------ llvm/test/Transforms/InstCombine/select.ll | 42 ++++++++ 2 files changed, 43 insertions(+), 95 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 32b3c56dc9a21..7c93c2175aa95 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1094,99 +1094,6 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal, return nullptr; } -/// Return true if we find and adjust an icmp+select pattern where the compare -/// is with a constant that can be incremented or decremented to match the -/// minimum or maximum idiom. -static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) { - ICmpInst::Predicate Pred = Cmp.getPredicate(); - Value *CmpLHS = Cmp.getOperand(0); - Value *CmpRHS = Cmp.getOperand(1); - Value *TrueVal = Sel.getTrueValue(); - Value *FalseVal = Sel.getFalseValue(); - - // We may move or edit the compare, so make sure the select is the only user. - const APInt *CmpC; - if (!Cmp.hasOneUse() || !match(CmpRHS, m_APInt(CmpC))) - return false; - - // These transforms only work for selects of integers or vector selects of - // integer vectors. - Type *SelTy = Sel.getType(); - auto *SelEltTy = dyn_cast(SelTy->getScalarType()); - if (!SelEltTy || SelTy->isVectorTy() != Cmp.getType()->isVectorTy()) - return false; - - Constant *AdjustedRHS; - if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT) - AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC + 1); - else if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT) - AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC - 1); - else - return false; - - // X > C ? X : C+1 --> X < C+1 ? C+1 : X - // X < C ? X : C-1 --> X > C-1 ? C-1 : X - if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) || - (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) { - ; // Nothing to do here. Values match without any sign/zero extension. - } - // Types do not match. Instead of calculating this with mixed types, promote - // all to the larger type. This enables scalar evolution to analyze this - // expression. 
- else if (CmpRHS->getType()->getScalarSizeInBits() < SelEltTy->getBitWidth()) { - Constant *SextRHS = ConstantExpr::getSExt(AdjustedRHS, SelTy); - - // X = sext x; x >s c ? X : C+1 --> X = sext x; X X = sext x; X >s C-1 ? C-1 : X - // X = sext x; x >u c ? X : C+1 --> X = sext x; X X = sext x; X >u C-1 ? C-1 : X - if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) && SextRHS == FalseVal) { - CmpLHS = TrueVal; - AdjustedRHS = SextRHS; - } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) && - SextRHS == TrueVal) { - CmpLHS = FalseVal; - AdjustedRHS = SextRHS; - } else if (Cmp.isUnsigned()) { - Constant *ZextRHS = ConstantExpr::getZExt(AdjustedRHS, SelTy); - // X = zext x; x >u c ? X : C+1 --> X = zext x; X X = zext x; X >u C-1 ? C-1 : X - // zext + signed compare cannot be changed: - // 0xff s 0x0000 - if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) && ZextRHS == FalseVal) { - CmpLHS = TrueVal; - AdjustedRHS = ZextRHS; - } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) && - ZextRHS == TrueVal) { - CmpLHS = FalseVal; - AdjustedRHS = ZextRHS; - } else { - return false; - } - } else { - return false; - } - } else { - return false; - } - - Pred = ICmpInst::getSwappedPredicate(Pred); - CmpRHS = AdjustedRHS; - std::swap(FalseVal, TrueVal); - Cmp.setPredicate(Pred); - Cmp.setOperand(0, CmpLHS); - Cmp.setOperand(1, CmpRHS); - Sel.setOperand(1, TrueVal); - Sel.setOperand(2, FalseVal); - Sel.swapProfMetadata(); - - // Move the compare instruction right before the select instruction. Otherwise - // the sext/zext value may be defined after the compare instruction uses it. - Cmp.moveBefore(&Sel); - - return true; -} - static Instruction *canonicalizeSPF(SelectInst &Sel, ICmpInst &Cmp, InstCombinerImpl &IC) { Value *LHS, *RHS; @@ -1718,12 +1625,11 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, tryToReuseConstantFromSelectInComparison(SI, *ICI, *this)) return NewSel; - bool Changed = adjustMinMax(SI, *ICI); - if (Value *V = foldSelectICmpAnd(SI, ICI, Builder)) return replaceInstUsesWith(SI, V); // NOTE: if we wanted to, this is where to detect integer MIN/MAX + bool Changed = false; Value *TrueVal = SI.getTrueValue(); Value *FalseVal = SI.getFalseValue(); ICmpInst::Predicate Pred = ICI->getPredicate(); diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index ccb62b027c655..39aeaa577fa5c 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3581,3 +3581,45 @@ define i32 @pr61361(i32 %arg) { %ashr = ashr i32 %sel2, 1 ret i32 %ashr } + +define i32 @pr62088() { +; CHECK-LABEL: @pr62088( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[NOT2:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ -2, [[LOOP]] ] +; CHECK-NEXT: [[H_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ 1, [[LOOP]] ] +; CHECK-NEXT: [[XOR1:%.*]] = or i32 [[H_0]], [[NOT2]] +; CHECK-NEXT: [[SUB5:%.*]] = sub i32 -1824888657, [[XOR1]] +; CHECK-NEXT: [[XOR6:%.*]] = xor i32 [[SUB5]], -1260914025 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[XOR6]], 824855120 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret i32 [[H_0]] +; +entry: + br label %loop + +loop: + %not2 = phi i32 [ 0, %entry ], [ -2, %loop ] + %i.0 = phi i32 [ 0, %entry ], [ %shr, %loop ] + %h.0 = phi i32 [ 0, %entry ], [ 1, %loop ] + %i.0.fr = freeze i32 %i.0 + %sext = shl i32 %i.0.fr, 16 + %conv = ashr exact i32 %sext, 16 + %not = xor i32 %conv, -1 + %and = and i32 %h.0, 1 + %rem.urem = 
sub nsw i32 %and, %conv + %rem.cmp = icmp ult i32 %and, %conv + %rem = select i1 %rem.cmp, i32 %not, i32 %rem.urem + %xor = xor i32 %rem, %not2 + %sub = sub nsw i32 0, %xor + %sub5 = sub i32 -1824888657, %xor + %xor6 = xor i32 %sub5, -1260914025 + %cmp = icmp slt i32 %xor6, 824855120 + %shr = ashr i32 %xor6, 40 + br i1 %cmp, label %loop, label %exit + +exit: + ret i32 %rem +} From 6042a1ac18b02687655153fa45b2d30271bc2187 Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Tue, 30 May 2023 16:07:51 +0200 Subject: [PATCH 081/704] [mlir] fix mis-merge --- mlir/include/mlir/Dialect/Transform/IR/TransformOps.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index 62b2cd698fb80..9305b6b0859e2 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -516,7 +516,7 @@ def NamedSequenceOp : TransformDialectOp<"named_sequence", return getResAttrs().value_or(nullptr); } }]; -}siblings must be matched explicitly +} def SplitHandleOp : TransformDialectOp<"split_handle", [FunctionalStyleTransformOpTrait, From d70573b18e9af94dcae7de2287ca56c77da27e7c Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Thu, 25 May 2023 10:09:37 -0700 Subject: [PATCH 082/704] [RISCV][NFC] Make Reduction scheduler resources SEW aware Create SchedWrites, WriteRes for reduction instructions that are SEW specific. Future patches can use these resources to customize the behavior of these resources depending on SEW. Differential Revision: https://reviews.llvm.org/D151470 --- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 68 ++++++---- llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 12 +- llvm/lib/Target/RISCV/RISCVScheduleV.td | 123 ++++++++++-------- 3 files changed, 119 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 85046f1b40a3a..d0d4622877262 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -115,8 +115,14 @@ defvar MxListF = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8]; // Used for widening and narrowing instructions as it doesn't contain M8. defvar MxListW = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4]; +// Used for widening reductions. It can contain M8 because wider operands are +// scalar operands. +defvar MxListWRed = MxList; // For floating point which don't need MF8. defvar MxListFW = [V_MF4, V_MF2, V_M1, V_M2, V_M4]; +// For widening floating-point Reduction as it doesn't contain MF8. It can +// contain M8 because wider operands are scalar operands. 
+defvar MxListFWRed = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8]; // Use for zext/sext.vf2 defvar MxListVF2 = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8]; @@ -3180,16 +3186,14 @@ multiclass VPseudoTernaryWithTailPolicy_E { let VLMul = MInfo.value in { defvar mx = MInfo.MX; - defvar sews = SchedSEWSet.val; - foreach e = sews in { let isCommutable = Commutable in - def "_" # mx # "_E" # e : VPseudoTernaryNoMaskWithPolicy; - def "_" # mx # "_E" # e # "_MASK" : VPseudoBinaryTailPolicy; - } + def "_" # mx # "_E" # sew : VPseudoTernaryNoMaskWithPolicy; + def "_" # mx # "_E" # sew # "_MASK" : VPseudoBinaryTailPolicy; } } @@ -3448,50 +3452,60 @@ multiclass VPseudoVCMPM_VX_VI { multiclass VPseudoVRED_VS { foreach m = MxList in { defvar mx = m.MX; - defvar WriteVIRedV_From_MX = !cast("WriteVIRedV_From_" # mx); - defm _VS : VPseudoTernaryWithTailPolicy_E, - Sched<[WriteVIRedV_From_MX, ReadVIRedV, ReadVIRedV, ReadVIRedV, - ReadVMask]>; + foreach e = SchedSEWSet.val in { + defvar WriteVIRedV_From_MX_E = !cast("WriteVIRedV_From_" # mx # "_E" # e); + defm _VS : VPseudoTernaryWithTailPolicy_E, + Sched<[WriteVIRedV_From_MX_E, ReadVIRedV, ReadVIRedV, ReadVIRedV, + ReadVMask]>; + } } } multiclass VPseudoVWRED_VS { - foreach m = MxList in { + foreach m = MxListWRed in { defvar mx = m.MX; - defvar WriteVIWRedV_From_MX = !cast("WriteVIWRedV_From_" # mx); - defm _VS : VPseudoTernaryWithTailPolicy_E, - Sched<[WriteVIWRedV_From_MX, ReadVIWRedV, ReadVIWRedV, - ReadVIWRedV, ReadVMask]>; + foreach e = SchedSEWSet.val in { + defvar WriteVIWRedV_From_MX_E = !cast("WriteVIWRedV_From_" # mx # "_E" # e); + defm _VS : VPseudoTernaryWithTailPolicy_E, + Sched<[WriteVIWRedV_From_MX_E, ReadVIWRedV, ReadVIWRedV, + ReadVIWRedV, ReadVMask]>; + } } } multiclass VPseudoVFRED_VS { foreach m = MxListF in { defvar mx = m.MX; - defvar WriteVFRedV_From_MX = !cast("WriteVFRedV_From_" # mx); - defm _VS : VPseudoTernaryWithTailPolicy_E, - Sched<[WriteVFRedV_From_MX, ReadVFRedV, ReadVFRedV, ReadVFRedV, - ReadVMask]>; + foreach e = SchedSEWSetF.val in { + defvar WriteVFRedV_From_MX_E = !cast("WriteVFRedV_From_" # mx # "_E" # e); + defm _VS : VPseudoTernaryWithTailPolicy_E, + Sched<[WriteVFRedV_From_MX_E, ReadVFRedV, ReadVFRedV, ReadVFRedV, + ReadVMask]>; + } } } multiclass VPseudoVFREDO_VS { foreach m = MxListF in { defvar mx = m.MX; - defvar WriteVFRedOV_From_MX = !cast("WriteVFRedOV_From_" # mx); - defm _VS : VPseudoTernaryWithTailPolicy_E, - Sched<[WriteVFRedOV_From_MX, ReadVFRedOV, ReadVFRedOV, - ReadVFRedOV, ReadVMask]>; + foreach e = SchedSEWSetF.val in { + defvar WriteVFRedOV_From_MX_E = !cast("WriteVFRedOV_From_" # mx # "_E" # e); + defm _VS : VPseudoTernaryWithTailPolicy_E, + Sched<[WriteVFRedOV_From_MX_E, ReadVFRedOV, ReadVFRedOV, + ReadVFRedOV, ReadVMask]>; + } } } multiclass VPseudoVFWRED_VS { - foreach m = MxListF in { + foreach m = MxListFWRed in { defvar mx = m.MX; - defvar WriteVFWRedV_From_MX = !cast("WriteVFWRedV_From_" # mx); - defm _VS : VPseudoTernaryWithTailPolicy_E, - Sched<[WriteVFWRedV_From_MX, ReadVFWRedV, ReadVFWRedV, - ReadVFWRedV, ReadVMask]>; + foreach e = SchedSEWSetF.val in { + defvar WriteVFWRedV_From_MX_E = !cast("WriteVFWRedV_From_" # mx # "_E" # e); + defm _VS : VPseudoTernaryWithTailPolicy_E, + Sched<[WriteVFWRedV_From_MX_E, ReadVFWRedV, ReadVFWRedV, + ReadVFWRedV, ReadVMask]>; + } } } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index d38051d2420ab..345dd90157e2f 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ 
b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -620,12 +620,12 @@ foreach mx = SchedMxListFW in { // 14. Vector Reduction Operations let Latency = 32 in { -defm "" : LMULWriteRes<"WriteVIRedV_From", [SiFive7VA]>; -defm "" : LMULWriteRes<"WriteVIWRedV_From", [SiFive7VA]>; -defm "" : LMULWriteRes<"WriteVFRedV_From", [SiFive7VA]>; -defm "" : LMULWriteRes<"WriteVFRedOV_From", [SiFive7VA]>; -defm "" : LMULWriteResFWRed<"WriteVFWRedV_From", [SiFive7VA]>; -defm "" : LMULWriteResFWRed<"WriteVFWRedOV_From", [SiFive7VA]>; +defm "" : LMULSEWWriteRes<"WriteVIRedV_From", [SiFive7VA]>; +defm "" : LMULSEWWriteRes<"WriteVIWRedV_From", [SiFive7VA]>; +defm "" : LMULSEWWriteRes<"WriteVFRedV_From", [SiFive7VA]>; +defm "" : LMULSEWWriteRes<"WriteVFRedOV_From", [SiFive7VA]>; +defm "" : LMULSEWWriteResFWRed<"WriteVFWRedV_From", [SiFive7VA]>; +defm "" : LMULSEWWriteResFWRed<"WriteVFWRedOV_From", [SiFive7VA]>; } // 15. Vector Mask Instructions diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index b6ab10454cfd3..5863f170d5d98 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -12,30 +12,35 @@ defvar SchedMxList = ["MF8", "MF4", "MF2", "M1", "M2", "M4", "M8"]; // Used for widening and narrowing instructions as it doesn't contain M8. defvar SchedMxListW = !listremove(SchedMxList, ["M8"]); +// Used for widening reductions, which does contain M8. +defvar SchedMxListWRed = SchedMxList; defvar SchedMxListFW = !listremove(SchedMxList, ["M8", "MF8"]); // Used for floating-point as it doesn't contain MF8. defvar SchedMxListF = !listremove(SchedMxList, ["MF8"]); // Used for widening floating-point Reduction as it doesn't contain MF8. defvar SchedMxListFWRed = SchedMxListF; -class SchedSEWSet { - list val = !cond(!eq(mx, "M1"): [8, 16, 32, 64], - !eq(mx, "M2"): [8, 16, 32, 64], - !eq(mx, "M4"): [8, 16, 32, 64], - !eq(mx, "M8"): [8, 16, 32, 64], - !eq(mx, "MF2"): [8, 16, 32], - !eq(mx, "MF4"): [8, 16], - !eq(mx, "MF8"): [8]); +// For widening instructions, SEW will not be 64. +class SchedSEWSet { + defvar t = !cond(!eq(mx, "M1"): [8, 16, 32, 64], + !eq(mx, "M2"): [8, 16, 32, 64], + !eq(mx, "M4"): [8, 16, 32, 64], + !eq(mx, "M8"): [8, 16, 32, 64], + !eq(mx, "MF2"): [8, 16, 32], + !eq(mx, "MF4"): [8, 16], + !eq(mx, "MF8"): [8]); + list val = !if(isWidening, !listremove(t, [64]), t); } // For floating-point instructions, SEW won't be 8. 
-class SchedSEWSetF { - list val = !cond(!eq(mx, "M1"): [16, 32, 64], - !eq(mx, "M2"): [16, 32, 64], - !eq(mx, "M4"): [16, 32, 64], - !eq(mx, "M8"): [16, 32, 64], - !eq(mx, "MF2"): [16, 32], - !eq(mx, "MF4"): [16]); +class SchedSEWSetF { + defvar t = !cond(!eq(mx, "M1"): [16, 32, 64], + !eq(mx, "M2"): [16, 32, 64], + !eq(mx, "M4"): [16, 32, 64], + !eq(mx, "M8"): [16, 32, 64], + !eq(mx, "MF2"): [16, 32], + !eq(mx, "MF4"): [16]); + list val = !if(isWidening, !listremove(t, [64]), t); } // Helper function to get the largest LMUL from MxList @@ -102,34 +107,46 @@ multiclass LMULReadAdvanceImpl MxList, bit isF = 0> { +multiclass LMULSEWSchedWritesImpl MxList, bit isF = 0, + bit isWidening = 0> { def name # "_WorstCase" : SchedWrite; foreach mx = MxList in { - foreach sew = !if(isF, SchedSEWSetF.val, SchedSEWSet.val) in + foreach sew = !if(isF, SchedSEWSetF.val, + SchedSEWSet.val) in def name # "_" # mx # "_E" # sew : SchedWrite; } } -multiclass LMULSEWSchedReadsImpl MxList, bit isF = 0> { +multiclass LMULSEWSchedReadsImpl MxList, bit isF = 0, + bit isWidening = 0> { def name # "_WorstCase" : SchedRead; foreach mx = MxList in { - foreach sew = !if(isF, SchedSEWSetF.val, SchedSEWSet.val) in + foreach sew = !if(isF,SchedSEWSetF.val, + SchedSEWSet.val) in def name # "_" # mx # "_E" # sew : SchedRead; } } multiclass LMULSEWWriteResImpl resources, - bit isF = 0> { - def : WriteRes(name # "_WorstCase"), resources>; - foreach mx = !if(isF, SchedMxListF, SchedMxList) in { - foreach sew = !if(isF, SchedSEWSetF.val, SchedSEWSet.val) in - def : WriteRes(name # "_" # mx # "_E" # sew), resources>; + list MxList, bit isF = 0, + bit isWidening = 0> { + if !exists(name # "_WorstCase") then + def : WriteRes(name # "_WorstCase"), resources>; + foreach mx = MxList in { + foreach sew = !if(isF,SchedSEWSetF.val, + SchedSEWSet.val) in + if !exists(name # "_" # mx # "_E" # sew) then + def : WriteRes(name # "_" # mx # "_E" # sew), resources>; } } multiclass LMULSEWReadAdvanceImpl writes = [], - bit isF = 0> { - def : ReadAdvance(name # "_WorstCase"), val, writes>; - foreach mx = !if(isF, SchedMxListF, SchedMxList) in { - foreach sew = !if(isF, SchedSEWSetF.val, SchedSEWSet.val) in - def : ReadAdvance(name # "_" # mx # "_E" # sew), val, writes>; + list MxList, bit isF = 0, + bit isWidening = 0> { + if !exists(name # "_WorstCase") then + def : ReadAdvance(name # "_WorstCase"), val, writes>; + foreach mx = MxList in { + foreach sew = !if(isF,SchedSEWSetF.val, + SchedSEWSet.val) in + if !exists(name # "_" # mx # "_E" # sew) then + def : ReadAdvance(name # "_" # mx # "_E" # sew), val, writes>; } } // Define classes to define list containing all SchedWrites for each (name, LMUL) @@ -159,16 +176,26 @@ class LMULSchedWriteList names> : LMULSchedWriteListImpl : LMULSEWSchedWritesImpl; multiclass LMULSEWSchedReads : LMULSEWSchedReadsImpl; multiclass LMULSEWWriteRes resources> - : LMULSEWWriteResImpl; + : LMULSEWWriteResImpl; multiclass LMULSEWReadAdvance writes = []> - : LMULSEWReadAdvanceImpl; + : LMULSEWReadAdvanceImpl; + +multiclass LMULSEWSchedWritesWRed + : LMULSEWSchedWritesImpl; +multiclass LMULSEWWriteResWRed resources> + : LMULSEWWriteResImpl; + +multiclass LMULSEWSchedWritesFWRed + : LMULSEWSchedWritesImpl; +multiclass LMULSEWWriteResFWRed resources> + : LMULSEWWriteResImpl; multiclass LMULSEWSchedWritesF : LMULSEWSchedWritesImpl; multiclass LMULSEWSchedReadsF : LMULSEWSchedReadsImpl; multiclass LMULSEWWriteResF resources> - : LMULSEWWriteResImpl; + : LMULSEWWriteResImpl; multiclass LMULSEWReadAdvanceF writes = []> - : 
LMULSEWReadAdvanceImpl; + : LMULSEWReadAdvanceImpl; multiclass LMULSchedWritesW : LMULSchedWritesImpl; multiclass LMULSchedReadsW : LMULSchedReadsImpl; @@ -186,12 +213,6 @@ multiclass LMULReadAdvanceFW writes = []> : LMULReadAdvanceImpl; class LMULSchedWriteListFW names> : LMULSchedWriteListImpl; -multiclass LMULSchedWritesFWRed : LMULSchedWritesImpl; -multiclass LMULWriteResFWRed resources> - : LMULWriteResImpl; -class LMULSchedWriteListFWRed names> : LMULSchedWriteListImpl; - - // 3.6 Vector Byte Length vlenb def WriteRdVLENB : SchedWrite; @@ -389,15 +410,15 @@ defm "" : LMULSchedWritesFW<"WriteVFNCvtFToFV">; // MF8 and M8. Use the _From suffix to indicate the number of the // LMUL from VS2. // 14.1. Vector Single-Width Integer Reduction Instructions -defm "" : LMULSchedWrites<"WriteVIRedV_From">; +defm "" : LMULSEWSchedWrites<"WriteVIRedV_From">; // 14.2. Vector Widening Integer Reduction Instructions -defm "" : LMULSchedWrites<"WriteVIWRedV_From">; +defm "" : LMULSEWSchedWritesWRed<"WriteVIWRedV_From">; // 14.3. Vector Single-Width Floating-Point Reduction Instructions -defm "" : LMULSchedWrites<"WriteVFRedV_From">; -defm "" : LMULSchedWrites<"WriteVFRedOV_From">; +defm "" : LMULSEWSchedWritesF<"WriteVFRedV_From">; +defm "" : LMULSEWSchedWritesF<"WriteVFRedOV_From">; // 14.4. Vector Widening Floating-Point Reduction Instructions -defm "" : LMULSchedWritesFWRed<"WriteVFWRedV_From">; -defm "" : LMULSchedWritesFWRed<"WriteVFWRedOV_From">; +defm "" : LMULSEWSchedWritesFWRed<"WriteVFWRedV_From">; +defm "" : LMULSEWSchedWritesFWRed<"WriteVFWRedOV_From">; // 15. Vector Mask Instructions // 15.1. Vector Mask-Register Logical Instructions @@ -821,12 +842,12 @@ defm "" : LMULWriteResW<"WriteVFNCvtFToIV", []>; defm "" : LMULWriteResFW<"WriteVFNCvtFToFV", []>; // 14. Vector Reduction Operations -defm "" : LMULWriteRes<"WriteVIRedV_From", []>; -defm "" : LMULWriteRes<"WriteVIWRedV_From", []>; -defm "" : LMULWriteRes<"WriteVFRedV_From", []>; -defm "" : LMULWriteRes<"WriteVFRedOV_From", []>; -defm "" : LMULWriteResFWRed<"WriteVFWRedV_From", []>; -defm "" : LMULWriteResFWRed<"WriteVFWRedOV_From", []>; +defm "" : LMULSEWWriteRes<"WriteVIRedV_From", []>; +defm "" : LMULSEWWriteResWRed<"WriteVIWRedV_From", []>; +defm "" : LMULSEWWriteResF<"WriteVFRedV_From", []>; +defm "" : LMULSEWWriteResF<"WriteVFRedOV_From", []>; +defm "" : LMULSEWWriteResFWRed<"WriteVFWRedV_From", []>; +defm "" : LMULSEWWriteResFWRed<"WriteVFWRedOV_From", []>; // 15. Vector Mask Instructions defm "" : LMULWriteRes<"WriteVMALUV", []>; From 891fad0448fc560877e67c980754c1c4a5c83735 Mon Sep 17 00:00:00 2001 From: Kohei Yamaguchi Date: Tue, 30 May 2023 14:13:47 +0000 Subject: [PATCH 083/704] [mlir] [NFC] Add a newline to debug message at inserting of InterfaceMap At inserting of InterfaceMap, a debug message lacked a newline, so it repeatedly displayed this message within a single line. Clean up the debug log by inserting a newline at the end of the message. 
Reviewed By: rriddle Differential Revision: https://reviews.llvm.org/D150182 --- mlir/lib/Support/InterfaceSupport.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Support/InterfaceSupport.cpp b/mlir/lib/Support/InterfaceSupport.cpp index d813046eca7f3..4f321457dd2c8 100644 --- a/mlir/lib/Support/InterfaceSupport.cpp +++ b/mlir/lib/Support/InterfaceSupport.cpp @@ -25,7 +25,7 @@ void detail::InterfaceMap::insert(TypeID interfaceId, void *conceptImpl) { return compare(it.first, id); }); if (it != interfaces.end() && it->first == interfaceId) { - LLVM_DEBUG(llvm::dbgs() << "Ignoring repeated interface registration"); + LLVM_DEBUG(llvm::dbgs() << "Ignoring repeated interface registration\n"); free(conceptImpl); return; } From 544a240ff7ff5bbacd3d50692335a93665ded8d5 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 30 May 2023 07:07:11 -0700 Subject: [PATCH 084/704] [RISCV] Use v(f)slide1up for shuffle+insert idiom This is pretty straight forward in the basic form. I did need to move the slideup matching earlier, but that looks generally profitable on it's own. As follow ups, I plan to explore the v(f)slide1down variants, and see what I can do to canonicalize the shuffle then insert pattern (see _inverse tests at the end of the vslide1up.ll test). Differential Revision: https://reviews.llvm.org/D151468 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 22 +++- .../rvv/fixed-vector-shuffle-transpose.ll | 14 +-- .../rvv/fixed-vector-shuffle-vslide1up.ll | 106 +++++++++--------- 3 files changed, 74 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f7010228351d7..3dc04d0f29e93 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3731,6 +3731,20 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlideup(const SDLoc &DL, MVT VT, MVT XLenVT = Subtarget.getXLenVT(); MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); auto TrueMask = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).first; + if (Index == 1 && NumSubElts + Index == (int)NumElts && + isa(InPlace)) { + if (SDValue Splat = cast(InPlace)->getSplatValue()) { + auto OpCode = + VT.isFloatingPoint() ? RISCVISD::VFSLIDE1UP_VL : RISCVISD::VSLIDE1UP_VL; + auto Vec = DAG.getNode(OpCode, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), + convertToScalableVector(ContainerVT, ToInsert, DAG, Subtarget), + Splat, TrueMask, + DAG.getConstant(NumSubElts + Index, DL, XLenVT)); + return convertFromScalableVector(VT, Vec, DAG, Subtarget); + } + } + // We slide up by the index that the subvector is being inserted at, and set // VL to the index + the number of elements being inserted. 
unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED | RISCVII::MASK_AGNOSTIC; @@ -3967,6 +3981,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, Subtarget, DAG); } + if (SDValue V = + lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return V; + // Detect an interleave shuffle and lower to // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1)) int EvenSrc, OddSrc; @@ -3989,10 +4007,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget); } - if (SDValue V = - lowerVECTOR_SHUFFLEAsVSlideup(DL, VT, V1, V2, Mask, Subtarget, DAG)) - return V; - // Detect shuffles which can be re-expressed as vector selects; these are // shuffles in which each element in the destination is taken from an element // at the corresponding index in either source vectors. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll index 688e882021068..37f67cad23e26 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-transpose.ll @@ -171,11 +171,8 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: trn1.v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vwaddu.vv v10, v8, v9 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ret <2 x i32> %tmp0 @@ -256,11 +253,8 @@ define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) { define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) { ; CHECK-LABEL: trn1.v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vwaddu.vv v10, v8, v9 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v10, a0, v9 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ret <2 x float> %tmp0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll index f7b667a36fa66..21fb38643bf2c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll @@ -8,11 +8,7 @@ define <2 x i8> @vslide1up_2xi8(<2 x i8> %v, i8 %b) { ; CHECK-LABEL: vslide1up_2xi8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.x v10, a0 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vwaddu.vv v9, v10, v8 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v9, a0, v8 +; CHECK-NEXT: vslide1up.vx v9, v8, a0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %vb = insertelement <2 x i8> poison, i8 %b, i64 0 @@ -33,8 +29,7 @@ define <4 x i8> @vslide1up_4xi8(<4 x i8> %v, i8 %b) { ; RV64-LABEL: vslide1up_4xi8: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vslideup.vi v9, v8, 1 +; RV64-NEXT: vslide1up.vx v9, v8, a0 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %vb = insertelement <4 x i8> poison, i8 %b, i64 0 @@ -55,8 +50,7 @@ define <4 x i8> 
@vslide1up_4xi8_swapped(<4 x i8> %v, i8 %b) { ; RV64-LABEL: vslide1up_4xi8_swapped: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vslideup.vi v9, v8, 1 +; RV64-NEXT: vslide1up.vx v9, v8, a0 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %vb = insertelement <4 x i8> poison, i8 %b, i64 0 @@ -68,22 +62,16 @@ define <2 x i16> @vslide1up_2xi16(<2 x i16> %v, i16 %b) { ; RV32-LABEL: vslide1up_2xi16: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vmv.s.x v10, a0 -; RV32-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; RV32-NEXT: vwaddu.vv v9, v10, v8 -; RV32-NEXT: li a0, -1 -; RV32-NEXT: vwmaccu.vx v9, a0, v8 +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vslideup.vi v9, v8, 1 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: vslide1up_2xi16: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; RV64-NEXT: vwaddu.vv v9, v10, v8 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vwmaccu.vx v9, a0, v8 +; RV64-NEXT: vslide1up.vx v9, v8, a0 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %vb = insertelement <2 x i16> poison, i16 %b, i64 0 @@ -95,8 +83,7 @@ define <4 x i16> @vslide1up_4xi16(<4 x i16> %v, i16 %b) { ; RV32-LABEL: vslide1up_4xi16: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vslideup.vi v9, v8, 1 +; RV32-NEXT: vslide1up.vx v9, v8, a0 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; @@ -117,22 +104,16 @@ define <2 x i32> @vslide1up_2xi32(<2 x i32> %v, i32 %b) { ; RV32-LABEL: vslide1up_2xi32: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v9, v10, v8 -; RV32-NEXT: li a0, -1 -; RV32-NEXT: vwmaccu.vx v9, a0, v8 +; RV32-NEXT: vslide1up.vx v9, v8, a0 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: vslide1up_2xi32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-NEXT: vmv.s.x v10, a0 -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v9, v10, v8 -; RV64-NEXT: li a0, -1 -; RV64-NEXT: vwmaccu.vx v9, a0, v8 +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vslideup.vi v9, v8, 1 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %vb = insertelement <2 x i32> poison, i32 %b, i64 0 @@ -144,8 +125,7 @@ define <4 x i32> @vslide1up_4xi32(<4 x i32> %v, i32 %b) { ; CHECK-LABEL: vslide1up_4xi32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.x v9, a0 -; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vslide1up.vx v9, v8, a0 ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %vb = insertelement <4 x i32> poison, i32 %b, i64 0 @@ -171,8 +151,7 @@ define <2 x i64> @vslide1up_2xi64(<2 x i64> %v, i64 %b) { ; RV64-LABEL: vslide1up_2xi64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a0 -; RV64-NEXT: vslideup.vi v9, v8, 1 +; RV64-NEXT: vslide1up.vx v9, v8, a0 ; RV64-NEXT: vmv.v.v v8, v9 ; RV64-NEXT: ret %vb = insertelement <2 x i64> poison, i64 %b, i64 0 @@ -198,8 +177,7 @@ define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) { ; RV64-LABEL: vslide1up_4xi64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: vslideup.vi v10, v8, 1 +; RV64-NEXT: vslide1up.vx v10, v8, a0 ; RV64-NEXT: 
vmv.v.v v8, v10 ; RV64-NEXT: ret %vb = insertelement <4 x i64> poison, i64 %b, i64 0 @@ -211,11 +189,7 @@ define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) { ; CHECK-LABEL: vslide1up_2xf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vwaddu.vv v9, v10, v8 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v9, a0, v8 +; CHECK-NEXT: vfslide1up.vf v9, v8, fa0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %vb = insertelement <2 x half> poison, half %b, i64 0 @@ -227,8 +201,7 @@ define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) { ; CHECK-LABEL: vslide1up_4xf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vfslide1up.vf v9, v8, fa0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %vb = insertelement <4 x half> poison, half %b, i64 0 @@ -240,11 +213,7 @@ define <2 x float> @vslide1up_2xf32(<2 x float> %v, float %b) { ; CHECK-LABEL: vslide1up_2xf32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vwaddu.vv v9, v10, v8 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: vwmaccu.vx v9, a0, v8 +; CHECK-NEXT: vfslide1up.vf v9, v8, fa0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %vb = insertelement <2 x float> poison, float %b, i64 0 @@ -256,8 +225,7 @@ define <4 x float> @vslide1up_4xf32(<4 x float> %v, float %b) { ; CHECK-LABEL: vslide1up_4xf32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vfslide1up.vf v9, v8, fa0 ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %vb = insertelement <4 x float> poison, float %b, i64 0 @@ -269,8 +237,7 @@ define <2 x double> @vslide1up_2xf64(<2 x double> %v, double %b) { ; CHECK-LABEL: vslide1up_2xf64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v9, fa0 -; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vfslide1up.vf v9, v8, fa0 ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %vb = insertelement <2 x double> poison, double %b, i64 0 @@ -291,6 +258,24 @@ define <4 x double> @vslide1up_4xf64(<4 x double> %v, double %b) { ret <4 x double> %v1 } +define <4 x i8> @vslide1up_4xi8_with_splat(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1up_4xi8_with_splat: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 14 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vadd.vi v10, v9, -1 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %vb = insertelement <4 x i8> poison, i8 %b, i64 0 + %v1 = shufflevector <4 x i8> %vb, <4 x i8> poison, <4 x i32> zeroinitializer + %v2 = shufflevector <4 x i8> %v1, <4 x i8> %v, <4 x i32> + ret <4 x i8> %v2 +} + define <2 x double> @vslide1up_v2f64_inverted(<2 x double> %v, double %b) { ; CHECK-LABEL: vslide1up_v2f64_inverted: ; CHECK: # %bb.0: @@ -320,7 +305,8 @@ define <4 x i8> @vslide1up_4xi8_inverted(<4 x i8> %v, i8 %b) { } -; The length of the shift is less than the suffix +; The length of the shift is less than the suffix, since we'd have to +; materailize the splat, using the vslide1up doesn't help us. 
define <4 x i32> @vslide1up_4xi32_neg1(<4 x i32> %v, i32 %b) { ; CHECK-LABEL: vslide1up_4xi32_neg1: ; CHECK: # %bb.0: @@ -335,3 +321,15 @@ define <4 x i32> @vslide1up_4xi32_neg1(<4 x i32> %v, i32 %b) { %v1 = shufflevector <4 x i32> %v, <4 x i32> %vb2, <4 x i32> ret <4 x i32> %v1 } + +; We don't know the scalar to do the vslide1up +define <4 x i32> @vslide1up_4xi32_neg2(<4 x i32> %v1, <4 x i32> %v2) { +; CHECK-LABEL: vslide1up_4xi32_neg2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %res = shufflevector <4 x i32> %v1, <4 x i32> %v2, <4 x i32> + ret <4 x i32> %res +} From 206b8538a6df53d5245b7524d83501e027c52418 Mon Sep 17 00:00:00 2001 From: Jacob Crawley Date: Mon, 22 May 2023 13:07:28 +0000 Subject: [PATCH 085/704] [flang] add hlfir.all intrinsic Adds a new HLFIR operation for the ALL intrinsic according to the design set out in flang/docs/HighLevel.md Differential Revision: https://reviews.llvm.org/D151090 --- .../include/flang/Optimizer/HLFIR/HLFIROps.td | 21 ++++ flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp | 53 +++++--- flang/test/HLFIR/all.fir | 113 ++++++++++++++++++ flang/test/HLFIR/invalid.fir | 36 ++++++ 4 files changed, 207 insertions(+), 16 deletions(-) create mode 100644 flang/test/HLFIR/all.fir diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td index 15b92385a7720..142a70c639127 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -317,6 +317,27 @@ def hlfir_ConcatOp : hlfir_Op<"concat", []> { let hasVerifier = 1; } +def hlfir_AllOp : hlfir_Op<"all", []> { + let summary = "ALL transformational intrinsic"; + let description = [{ + Takes a logical array MASK as argument, optionally along a particular dimension, + and returns true if all elements of MASK are true. + }]; + + let arguments = (ins + AnyFortranLogicalArrayObject:$mask, + Optional:$dim + ); + + let results = (outs AnyFortranValue); + + let assemblyFormat = [{ + $mask (`dim` $dim^)? 
attr-dict `:` functional-type(operands, results) + }]; + + let hasVerifier = 1; +} + def hlfir_AnyOp : hlfir_Op<"any", []> { let summary = "ANY transformational intrinsic"; let description = [{ diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index 4547c4247241e..adf8b72993e4c 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -442,16 +442,19 @@ mlir::LogicalResult hlfir::ParentComponentOp::verify() { } //===----------------------------------------------------------------------===// -// AnyOp +// LogicalReductionOp //===----------------------------------------------------------------------===// -mlir::LogicalResult hlfir::AnyOp::verify() { - mlir::Operation *op = getOperation(); +template +static mlir::LogicalResult +verifyLogicalReductionOp(LogicalReductionOp reductionOp) { + mlir::Operation *op = reductionOp->getOperation(); auto results = op->getResultTypes(); assert(results.size() == 1); - mlir::Value mask = getMask(); - mlir::Value dim = getDim(); + mlir::Value mask = reductionOp->getMask(); + mlir::Value dim = reductionOp->getDim(); + fir::SequenceType maskTy = hlfir::getFortranElementOrSequenceType(mask.getType()) .cast(); @@ -462,7 +465,7 @@ mlir::LogicalResult hlfir::AnyOp::verify() { if (mlir::isa(resultType)) { // Result is of the same type as MASK if (resultType != logicalTy) - return emitOpError( + return reductionOp->emitOpError( "result must have the same element type as MASK argument"); } else if (auto resultExpr = @@ -470,25 +473,42 @@ mlir::LogicalResult hlfir::AnyOp::verify() { // Result should only be in hlfir.expr form if it is an array if (maskShape.size() > 1 && dim != nullptr) { if (!resultExpr.isArray()) - return emitOpError("result must be an array"); + return reductionOp->emitOpError("result must be an array"); if (resultExpr.getEleTy() != logicalTy) - return emitOpError( + return reductionOp->emitOpError( "result must have the same element type as MASK argument"); llvm::ArrayRef resultShape = resultExpr.getShape(); // Result has rank n-1 if (resultShape.size() != (maskShape.size() - 1)) - return emitOpError("result rank must be one less than MASK"); + return reductionOp->emitOpError( + "result rank must be one less than MASK"); } else { - return emitOpError("result must be of logical type"); + return reductionOp->emitOpError("result must be of logical type"); } } else { - return emitOpError("result must be of logical type"); + return reductionOp->emitOpError("result must be of logical type"); } return mlir::success(); } +//===----------------------------------------------------------------------===// +// AllOp +//===----------------------------------------------------------------------===// + +mlir::LogicalResult hlfir::AllOp::verify() { + return verifyLogicalReductionOp(this); +} + +//===----------------------------------------------------------------------===// +// AnyOp +//===----------------------------------------------------------------------===// + +mlir::LogicalResult hlfir::AnyOp::verify() { + return verifyLogicalReductionOp(this); +} + //===----------------------------------------------------------------------===// // ConcatOp //===----------------------------------------------------------------------===// @@ -537,11 +557,12 @@ void hlfir::ConcatOp::build(mlir::OpBuilder &builder, } //===----------------------------------------------------------------------===// -// ReductionOp +// NumericalReductionOp 
//===----------------------------------------------------------------------===// -template -static mlir::LogicalResult verifyReductionOp(ReductionOp reductionOp) { +template +static mlir::LogicalResult +verifyNumericalReductionOp(NumericalReductionOp reductionOp) { mlir::Operation *op = reductionOp->getOperation(); auto results = op->getResultTypes(); @@ -619,7 +640,7 @@ static mlir::LogicalResult verifyReductionOp(ReductionOp reductionOp) { //===----------------------------------------------------------------------===// mlir::LogicalResult hlfir::ProductOp::verify() { - return verifyReductionOp(this); + return verifyNumericalReductionOp(this); } //===----------------------------------------------------------------------===// @@ -645,7 +666,7 @@ void hlfir::SetLengthOp::build(mlir::OpBuilder &builder, //===----------------------------------------------------------------------===// mlir::LogicalResult hlfir::SumOp::verify() { - return verifyReductionOp(this); + return verifyNumericalReductionOp(this); } //===----------------------------------------------------------------------===// diff --git a/flang/test/HLFIR/all.fir b/flang/test/HLFIR/all.fir new file mode 100644 index 0000000000000..00ce1b3a5fbae --- /dev/null +++ b/flang/test/HLFIR/all.fir @@ -0,0 +1,113 @@ +// Test hlfir.all operation parse, verify (no errors), and unparse + +// RUN: fir-opt %s | fir-opt | FileCheck %s + +// mask is an expression of known shape +func.func @all0(%arg0: !hlfir.expr<2x!fir.logical<4>>) { + %all = hlfir.all %arg0 : (!hlfir.expr<2x!fir.logical<4>>) -> !fir.logical<4> + return +} +// CHECK: func.func @all0(%[[ARRAY:.*]]: !hlfir.expr<2x!fir.logical<4>>) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] : (!hlfir.expr<2x!fir.logical<4>>) -> !fir.logical<4> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// mask is an expression of assumed shape +func.func @all1(%arg0: !hlfir.expr>) { + %all = hlfir.all %arg0 : (!hlfir.expr>) -> !fir.logical<4> + return +} +// CHECK: func.func @all1(%[[ARRAY:.*]]: !hlfir.expr>) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] : (!hlfir.expr>) -> !fir.logical<4> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// mask is a boxed array +func.func @all2(%arg0: !fir.box>>) { + %all = hlfir.all %arg0 : (!fir.box>>) -> !fir.logical<4> + return +} +// CHECK: func.func @all2(%[[ARRAY:.*]]: !fir.box>>) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] : (!fir.box>>) -> !fir.logical<4> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// mask is an assumed shape boxed array +func.func @all3(%arg0: !fir.box>>){ + %all = hlfir.all %arg0 : (!fir.box>>) -> !fir.logical<4> + return +} +// CHECK: func.func @all3(%[[ARRAY:.*]]: !fir.box>>) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] : (!fir.box>>) -> !fir.logical<4> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// mask is a 2-dimensional array +func.func @all4(%arg0: !fir.box>>){ + %all = hlfir.all %arg0 : (!fir.box>>) -> !fir.logical<4> + return +} +// CHECK: func.func @all4(%[[ARRAY:.*]]: !fir.box>>) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] : (!fir.box>>) -> !fir.logical<4> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// mask and dim argument +func.func @all5(%arg0: !fir.box>>, %arg1: i32) { + %all = hlfir.all %arg0 dim %arg1 : (!fir.box>>, i32) -> !fir.logical<4> + return +} +// CHECK: func.func @all5(%[[ARRAY:.*]]: !fir.box>>, %[[DIM:.*]]: i32) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] dim %[[DIM]] : (!fir.box>>, i32) -> !fir.logical<4> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// hlfir.all with dim argument with 
an unusual type +func.func @all6(%arg0: !fir.box>>, %arg1: index) { + %all = hlfir.all %arg0 dim %arg1 : (!fir.box>>, index) ->!fir.logical<4> + return +} +// CHECK: func.func @all6(%[[ARRAY:.*]]: !fir.box>>, %[[DIM:.*]]: index) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] dim %[[DIM]] : (!fir.box>>, index) -> !fir.logical<4> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// mask is a 2 dimensional array with dim +func.func @all7(%arg0: !fir.box>>, %arg1: i32) { + %all = hlfir.all %arg0 dim %arg1 : (!fir.box>>, i32) -> !hlfir.expr> + return +} +// CHECK: func.func @all7(%[[ARRAY:.*]]: !fir.box>>, %[[DIM:.*]]: i32) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] dim %[[DIM]] : (!fir.box>>, i32) -> !hlfir.expr> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// known shape expr return +func.func @all8(%arg0: !fir.box>>, %arg1: i32) { + %all = hlfir.all %arg0 dim %arg1 : (!fir.box>>, i32) -> !hlfir.expr<2x!fir.logical<4>> + return +} +// CHECK: func.func @all8(%[[ARRAY:.*]]: !fir.box>>, %[[DIM:.*]]: i32) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] dim %[[DIM]] : (!fir.box>>, i32) -> !hlfir.expr<2x!fir.logical<4>> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// hlfir.all with mask argument of ref> type +func.func @all9(%arg0: !fir.ref>>) { + %all = hlfir.all %arg0 : (!fir.ref>>) -> !fir.logical<4> + return +} +// CHECK: func.func @all9(%[[ARRAY:.*]]: !fir.ref>>) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] : (!fir.ref>>) -> !fir.logical<4> +// CHECK-NEXT: return +// CHECK-NEXT: } + +// hlfir.all with fir.logical<8> type +func.func @all10(%arg0: !fir.box>>) { + %all = hlfir.all %arg0 : (!fir.box>>) -> !fir.logical<8> + return +} +// CHECK: func.func @all10(%[[ARRAY:.*]]: !fir.box>>) { +// CHECK-NEXT: %[[ALL:.*]] = hlfir.all %[[ARRAY]] : (!fir.box>>) -> !fir.logical<8> +// CHECK-NEXT: return +// CHECK-NEXT: } \ No newline at end of file diff --git a/flang/test/HLFIR/invalid.fir b/flang/test/HLFIR/invalid.fir index e1c95c1046dc4..8dc5679346bc1 100644 --- a/flang/test/HLFIR/invalid.fir +++ b/flang/test/HLFIR/invalid.fir @@ -332,6 +332,42 @@ func.func @bad_any6(%arg0: !hlfir.expr>) { %0 = hlfir.any %arg0 : (!hlfir.expr>) -> !hlfir.expr> } +// ----- +func.func @bad_all1(%arg0: !hlfir.expr>) { + // expected-error@+1 {{'hlfir.all' op result must have the same element type as MASK argument}} + %0 = hlfir.all %arg0 : (!hlfir.expr>) -> !fir.logical<8> +} + +// ----- +func.func @bad_all2(%arg0: !hlfir.expr>, %arg1: i32) { + // expected-error@+1 {{'hlfir.all' op result must have the same element type as MASK argument}} + %0 = hlfir.all %arg0 dim %arg1 : (!hlfir.expr>, i32) -> !hlfir.expr> +} + +// ----- +func.func @bad_all3(%arg0: !hlfir.expr>, %arg1: i32){ + // expected-error@+1 {{'hlfir.all' op result rank must be one less than MASK}} + %0 = hlfir.all %arg0 dim %arg1 : (!hlfir.expr>, i32) -> !hlfir.expr> +} + +// ----- +func.func @bad_all4(%arg0: !hlfir.expr>, %arg1: i32) { + // expected-error@+1 {{'hlfir.all' op result must be an array}} + %0 = hlfir.all %arg0 dim %arg1 : (!hlfir.expr>, i32) -> !hlfir.expr> +} + +// ----- +func.func @bad_all5(%arg0: !hlfir.expr>) { + // expected-error@+1 {{'hlfir.all' op result must be of logical type}} + %0 = hlfir.all %arg0 : (!hlfir.expr>) -> i32 +} + +// ----- +func.func @bad_all6(%arg0: !hlfir.expr>) { + // expected-error@+1 {{'hlfir.all' op result must be of logical type}} + %0 = hlfir.all %arg0 : (!hlfir.expr>) -> !hlfir.expr> +} + // ----- func.func @bad_product1(%arg0: !hlfir.expr, %arg1: i32, %arg2: !fir.box>) { // expected-error@+1 
{{'hlfir.product' op result must have the same element type as ARRAY argument}} From 9e7699a21bd29d73390fd8d55821c481c8e5e542 Mon Sep 17 00:00:00 2001 From: Jacob Crawley Date: Mon, 22 May 2023 15:06:49 +0000 Subject: [PATCH 086/704] [flang] lower all intrinsic to hlfir.all operation Carries out the lowering of the all intrinsic into HLFIR Differential Revision: https://reviews.llvm.org/D151111 --- flang/lib/Lower/ConvertCall.cpp | 43 +++++++++++------- flang/test/Lower/HLFIR/all.f90 | 80 +++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 16 deletions(-) create mode 100644 flang/test/Lower/HLFIR/all.f90 diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 53ab160f6c089..66af19b94e78d 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -1407,22 +1407,38 @@ genHLFIRIntrinsicRefCore(PreparedActualArguments &loweredActuals, return builder.create(loc, resultTy, array, dim, mask); }; + auto buildAnyOperation = [](fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Type resultTy, mlir::Value array, + mlir::Value dim, mlir::Value mask) { + return builder.create(loc, resultTy, array, dim); + }; + + auto buildAllOperation = [](fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Type resultTy, mlir::Value array, + mlir::Value dim, mlir::Value mask) { + return builder.create(loc, resultTy, array, dim); + }; + auto buildReductionIntrinsic = [&](PreparedActualArguments &loweredActuals, mlir::Location loc, fir::FirOpBuilder &builder, CallContext &callContext, std::function - buildFunc) -> std::optional { + buildFunc, + bool hasMask) -> std::optional { // shared logic for building the product and sum operations llvm::SmallVector operands = getOperandVector(loweredActuals); - assert(operands.size() == 3); // dim, mask can be NULL if these arguments were not given mlir::Value array = operands[0]; mlir::Value dim = operands[1]; if (dim) dim = hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{dim}); - mlir::Value mask = operands[2]; + + mlir::Value mask; + if (hasMask) + mask = operands[2]; + mlir::Type resultTy = computeResultType(array, *callContext.resultType); auto *intrinsicOp = buildFunc(builder, loc, resultTy, array, dim, mask); return {hlfir::EntityWithAttributes{intrinsicOp->getResult(0)}}; @@ -1431,11 +1447,11 @@ genHLFIRIntrinsicRefCore(PreparedActualArguments &loweredActuals, const std::string intrinsicName = callContext.getProcedureName(); if (intrinsicName == "sum") { return buildReductionIntrinsic(loweredActuals, loc, builder, callContext, - buildSumOperation); + buildSumOperation, true); } if (intrinsicName == "product") { return buildReductionIntrinsic(loweredActuals, loc, builder, callContext, - buildProductOperation); + buildProductOperation, true); } if (intrinsicName == "matmul") { llvm::SmallVector operands = getOperandVector(loweredActuals); @@ -1465,17 +1481,12 @@ genHLFIRIntrinsicRefCore(PreparedActualArguments &loweredActuals, return {hlfir::EntityWithAttributes{transposeOp.getResult()}}; } if (intrinsicName == "any") { - llvm::SmallVector operands = getOperandVector(loweredActuals); - assert(operands.size() == 2); - // dim argument can be NULL if not given - mlir::Value mask = operands[0]; - mlir::Value dim = operands[1]; - if (dim) - dim = hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{dim}); - mlir::Type resultTy = computeResultType(mask, *callContext.resultType); - hlfir::AnyOp anyOp = builder.create(loc, resultTy, mask, dim); - - return 
{hlfir::EntityWithAttributes{anyOp.getResult()}}; + return buildReductionIntrinsic(loweredActuals, loc, builder, callContext, + buildAnyOperation, false); + } + if (intrinsicName == "all") { + return buildReductionIntrinsic(loweredActuals, loc, builder, callContext, + buildAllOperation, false); } // TODO add hlfir operations for other transformational intrinsics here diff --git a/flang/test/Lower/HLFIR/all.f90 b/flang/test/Lower/HLFIR/all.f90 new file mode 100644 index 0000000000000..080039af9b12c --- /dev/null +++ b/flang/test/Lower/HLFIR/all.f90 @@ -0,0 +1,80 @@ +! Test lowering of ALL intrinsic to HLFIR +! RUN: bbc -emit-fir -hlfir -o - %s 2>&1 | FileCheck %s + +! simple 1 argument ALL +subroutine all1(a, s) + logical :: a(:), s + s = ALL(a) +end subroutine +! CHECK-LABEL: func.func @_QPall1( +! CHECK: %[[ARG0:.*]]: !fir.box>> +! CHECK: %[[ARG1:.*]]: !fir.ref> +! CHECK-DAG: %[[MASK:.*]]:2 = hlfir.declare %[[ARG0]] +! CHECK-DAG: %[[OUT:.*]]:2 = hlfir.declare %[[ARG1]] +! CHECK-NEXT: %[[EXPR:.*]] = hlfir.all %[[MASK]]#0 : (!fir.box>>) -> !fir.logical<4> +! CHECK-NEXT: hlfir.assign %[[EXPR]] to %[[OUT]]#0 : !fir.logical<4>, !fir.ref> +! CHECK-NEXT: return +! CHECK-NEXT: } + +! all with by-ref DIM argument +subroutine all2(a, s, d) + logical :: a(:,:), s(:) + integer :: d +s = ALL(a, d) +end subroutine +! CHECK-LABEL: func.func @_QPall2( +! CHECK: %[[ARG0:.*]]: !fir.box>> {fir.bindc_name = "a"} +! CHECK: %[[ARG1:.*]]: !fir.box>> {fir.bindc_name = "s"} +! CHECK: %[[ARG2:.*]]: !fir.ref {fir.bindc_name = "d"} +! CHECK-DAG: %[[MASK:.*]]:2 = hlfir.declare %[[ARG0]] +! CHECK-DAG: %[[DIM_REF:.*]]:2 = hlfir.declare %[[ARG2]] +! CHECK-DAG: %[[OUT:.*]]:2 = hlfir.declare %[[ARG1]] +! CHECK-NEXT: %[[DIM:.*]] = fir.load %[[DIM_REF]]#0 : !fir.ref +! CHECK-NEXT: %[[EXPR:.*]] = hlfir.all %[[MASK]]#0 dim %[[DIM]] : (!fir.box>>, i32) -> !hlfir.expr> +! CHECK-NEXT: hlfir.assign %[[EXPR]] to %[[OUT]]#0 : !hlfir.expr>, !fir.box>> +! CHECK-NEXT: hlfir.destroy %[[EXPR]] +! CHECK-NEXT: return +! CHECK-NEXT: } + +! all with DIM argument by-val, mask isn't boxed +subroutine all3(s) + logical :: s(2) + logical :: a(2,2) = reshape((/.true.,.false.,.true.,.false./), shape(a)) +s = ALL(a, 1) +end subroutine +! CHECK-LABEL: func.func @_QPall3( +! CHECK: %[[ARG0:.*]]: !fir.ref>> {fir.bindc_name = "s"} +! CHECK-DAG: %[[ADDR:.*]] = fir.address_of{{.*}} : !fir.ref>> +! CHECK-DAG: %[[MASK_SHAPE:.*]] = fir.shape {{.*}} -> !fir.shape<2> +! CHECK-DAG: %[[MASK:.*]]:2 = hlfir.declare %[[ADDR]](%[[MASK_SHAPE]]) +! CHECK-DAG: %[[OUT_SHAPE:.*]] = fir.shape {{.*}} -> !fir.shape<1> +! CHECK-DAG: %[[OUT:.*]]:2 = hlfir.declare %[[ARG0]](%[[OUT_SHAPE]]) +! CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32 +! CHECK-DAG: %[[EXPR:.*]] = hlfir.all %[[MASK]]#0 dim %[[C1]] : (!fir.ref>>, i32) -> !hlfir.expr<2x!fir.logical<4>> +! CHECK-DAG: hlfir.assign %[[EXPR]] to %[[OUT]] +! CHECK-NEXT: hlfir.destroy %[[EXPR]] : !hlfir.expr<2x!fir.logical<4>> +! CHECK-NEXT: return +! CHECK-NEXT: } + +! all with DIM from pointer +subroutine all4(a, s, d) + integer, pointer :: d + logical :: a(:,:), s(:) + s = ALL(a, (d)) +end subroutine +! CHECK-LABEL: func.func @_QPall4( +! CHECK: %[[ARG0:.*]]: !fir.box>> {fir.bindc_name = "a"} +! CHECK: %[[ARG1:.*]]: !fir.box>> {fir.bindc_name = "s"} +! CHECK: %[[ARG2:.*]]: !fir.ref>> {fir.bindc_name = "d"} +! CHECK-DAG: %[[ARRAY:.*]]:2 = hlfir.declare %[[ARG0]] +! CHECK-DAG: %[[OUT:.*]]:2 = hlfir.declare %[[ARG1]] +! CHECK-DAG: %[[DIM:.*]]:2 = hlfir.declare %[[ARG2]] +! 
CHECK-NEXT: %[[DIM_BOX:.*]] = fir.load %[[DIM]]#0 : !fir.ref>> +! CHECK-NEXT: %[[DIM_ADDR:.*]] = fir.box_addr %[[DIM_BOX]] : (!fir.box>) -> !fir.ptr +! CHECK-NEXT: %[[DIM0:.*]] = fir.load %[[DIM_ADDR]] : !fir.ptr +! CHECK-NEXT: %[[DIM1:.*]] = hlfir.no_reassoc %[[DIM0]] : i32 +! CHECK-NEXT: %[[EXPR:.*]] = hlfir.all %[[ARRAY]]#0 dim %[[DIM1]] : (!fir.box>>, i32) -> !hlfir.expr> +! CHECK-NEXT: hlfir.assign %[[EXPR]] to %[[OUT]]#0 : !hlfir.expr>, !fir.box>> +! CHECK-NEXT: hlfir.destroy %[[EXPR]] : !hlfir.expr> +! CHECK-NEXT: return +! CHECK-NEXT: } From 8a6dadaad096fe6ded1e130487bd104b99fb008d Mon Sep 17 00:00:00 2001 From: Jacob Crawley Date: Wed, 24 May 2023 15:38:03 +0000 Subject: [PATCH 087/704] [flang][hlfir] lower hlfir.all into runtime call Depends on: D151111 Differential Revision: https://reviews.llvm.org/D151415 --- .../HLFIR/Transforms/LowerHLFIRIntrinsics.cpp | 86 +++++----- flang/test/HLFIR/all-lowering.fir | 157 ++++++++++++++++++ 2 files changed, 201 insertions(+), 42 deletions(-) create mode 100644 flang/test/HLFIR/all-lowering.fir diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp index 1cf3929c1c043..0ffb2ac9ca0cb 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp @@ -168,6 +168,30 @@ class HlfirReductionIntrinsicConversion : public HlfirIntrinsicConversion { using HlfirIntrinsicConversion::HlfirIntrinsicConversion; using IntrinsicArgument = typename HlfirIntrinsicConversion::IntrinsicArgument; + using HlfirIntrinsicConversion::lowerArguments; + using HlfirIntrinsicConversion::processReturnValue; + +protected: + auto buildNumericalArgs(OP operation, mlir::Type i32, mlir::Type logicalType, + mlir::PatternRewriter &rewriter, + std::string opName) const { + llvm::SmallVector inArgs; + inArgs.push_back({operation.getArray(), operation.getArray().getType()}); + inArgs.push_back({operation.getDim(), i32}); + inArgs.push_back({operation.getMask(), logicalType}); + auto *argLowering = fir::getIntrinsicArgumentLowering(opName); + return lowerArguments(operation, inArgs, rewriter, argLowering); + }; + + auto buildLogicalArgs(OP operation, mlir::Type i32, mlir::Type logicalType, + mlir::PatternRewriter &rewriter, + std::string opName) const { + llvm::SmallVector inArgs; + inArgs.push_back({operation.getMask(), logicalType}); + inArgs.push_back({operation.getDim(), i32}); + auto *argLowering = fir::getIntrinsicArgumentLowering(opName); + return lowerArguments(operation, inArgs, rewriter, argLowering); + }; public: mlir::LogicalResult @@ -178,9 +202,14 @@ class HlfirReductionIntrinsicConversion : public HlfirIntrinsicConversion { opName = "sum"; } else if constexpr (std::is_same_v) { opName = "product"; + } else if constexpr (std::is_same_v) { + opName = "any"; + } else if constexpr (std::is_same_v) { + opName = "all"; } else { return mlir::failure(); } + fir::KindMapping kindMapping{rewriter.getContext()}; fir::FirOpBuilder builder{rewriter, kindMapping}; const mlir::Location &loc = operation->getLoc(); @@ -188,14 +217,15 @@ class HlfirReductionIntrinsicConversion : public HlfirIntrinsicConversion { mlir::Type i32 = builder.getI32Type(); mlir::Type logicalType = fir::LogicalType::get( builder.getContext(), builder.getKindMap().defaultLogicalKind()); - llvm::SmallVector inArgs; - inArgs.push_back({operation.getArray(), operation.getArray().getType()}); - inArgs.push_back({operation.getDim(), i32}); - 
inArgs.push_back({operation.getMask(), logicalType}); - auto *argLowering = fir::getIntrinsicArgumentLowering(opName); - llvm::SmallVector args = - this->lowerArguments(operation, inArgs, rewriter, argLowering); + llvm::SmallVector args; + + if constexpr (std::is_same_v || + std::is_same_v) { + args = buildNumericalArgs(operation, i32, logicalType, rewriter, opName); + } else { + args = buildLogicalArgs(operation, i32, logicalType, rewriter, opName); + } mlir::Type scalarResultType = hlfir::getFortranElementType(operation.getType()); @@ -203,8 +233,7 @@ class HlfirReductionIntrinsicConversion : public HlfirIntrinsicConversion { auto [resultExv, mustBeFreed] = fir::genIntrinsicCall(builder, loc, opName, scalarResultType, args); - this->processReturnValue(operation, resultExv, mustBeFreed, builder, - rewriter); + processReturnValue(operation, resultExv, mustBeFreed, builder, rewriter); return mlir::success(); } }; @@ -213,37 +242,9 @@ using SumOpConversion = HlfirReductionIntrinsicConversion; using ProductOpConversion = HlfirReductionIntrinsicConversion; -struct AnyOpConversion : public HlfirIntrinsicConversion { - using HlfirIntrinsicConversion::HlfirIntrinsicConversion; +using AnyOpConversion = HlfirReductionIntrinsicConversion; - mlir::LogicalResult - matchAndRewrite(hlfir::AnyOp any, - mlir::PatternRewriter &rewriter) const override { - fir::KindMapping kindMapping{rewriter.getContext()}; - fir::FirOpBuilder builder{rewriter, kindMapping}; - const mlir::Location &loc = any->getLoc(); - - mlir::Type i32 = builder.getI32Type(); - mlir::Type logicalType = fir::LogicalType::get( - builder.getContext(), builder.getKindMap().defaultLogicalKind()); - llvm::SmallVector inArgs; - inArgs.push_back({any.getMask(), logicalType}); - inArgs.push_back({any.getDim(), i32}); - - auto *argLowering = fir::getIntrinsicArgumentLowering("any"); - llvm::SmallVector args = - this->lowerArguments(any, inArgs, rewriter, argLowering); - - mlir::Type resultType = hlfir::getFortranElementType(any.getType()); - - auto [resultExv, mustBeFreed] = - fir::genIntrinsicCall(builder, loc, "any", resultType, args); - - this->processReturnValue(any, resultExv, mustBeFreed, builder, rewriter); - - return mlir::success(); - } -}; +using AllOpConversion = HlfirReductionIntrinsicConversion; struct MatmulOpConversion : public HlfirIntrinsicConversion { using HlfirIntrinsicConversion::HlfirIntrinsicConversion; @@ -354,14 +355,15 @@ class LowerHLFIRIntrinsics mlir::MLIRContext *context = &getContext(); mlir::RewritePatternSet patterns(context); patterns.insert(context); + AllOpConversion, AnyOpConversion, SumOpConversion, + ProductOpConversion, TransposeOpConversion>(context); mlir::ConversionTarget target(*context); target.addLegalDialect(); target.addIllegalOp(); + hlfir::ProductOp, hlfir::TransposeOp, hlfir::AnyOp, + hlfir::AllOp>(); target.markUnknownOpDynamicallyLegal( [](mlir::Operation *) { return true; }); if (mlir::failed( diff --git a/flang/test/HLFIR/all-lowering.fir b/flang/test/HLFIR/all-lowering.fir new file mode 100644 index 0000000000000..dfd1ace947d68 --- /dev/null +++ b/flang/test/HLFIR/all-lowering.fir @@ -0,0 +1,157 @@ +// Test hlfir.all operation lowering to fir runtime call +// RUN: fir-opt %s -lower-hlfir-intrinsics | FileCheck %s + +func.func @_QPall1(%arg0: !fir.box>> {fir.bindc_name = "a"}, %arg1: !fir.ref> {fir.bindc_name = "s"}) { + %0:2 = hlfir.declare %arg0 {uniq_name = "_QFall1Ea"} : (!fir.box>>) -> (!fir.box>>, !fir.box>>) + %1:2 = hlfir.declare %arg1 {uniq_name = "_QFall1Es"} : (!fir.ref>) -> 
(!fir.ref>, !fir.ref>) + %2 = hlfir.all %0#0 : (!fir.box>>) -> !fir.logical<4> + hlfir.assign %2 to %1#0 : !fir.logical<4>, !fir.ref> + return +} +// CHECK-LABEL: func.func @_QPall1( +// CHECK: %[[ARG0:.*]]: !fir.box>> {fir.bindc_name = "a"} +// CHECK: %[[ARG1:.*]]: !fir.ref> +// CHECK-DAG: %[[MASK:.*]]:2 = hlfir.declare %[[ARG0]] +// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[ARG1]] +// CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]]#1 : (!fir.box>>) -> !fir.box +// CHECK: %[[RET_ARG:.*]] = fir.call @_FortranAAll(%[[MASK_ARG]], %[[LOC_STR:.*]], %[[LOC_N:.*]], %[[C1:.*]]) : (!fir.box, !fir.ref, i32, i32) -> i1 +// CHECK-NEXT: %[[RET:.*]] = fir.convert %[[RET_ARG]] : (i1) -> !fir.logical<4> +// CHECK-NEXT: hlfir.assign %[[RET]] to %[[RES]]#0 : !fir.logical<4>, !fir.ref> +// CHECK-NEXT: return +// CHECK-NEXT: } + +func.func @_QPall2(%arg0: !fir.box>> {fir.bindc_name = "a"}, %arg1: !fir.box>> {fir.bindc_name = "s"}, %arg2: !fir.ref {fir.bindc_name = "d"}) { + %0:2 = hlfir.declare %arg0 {uniq_name = "_QFall2Ea"} : (!fir.box>>) -> (!fir.box>>, !fir.box>>) + %1:2 = hlfir.declare %arg2 {uniq_name = "_QFall2Ed"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %2:2 = hlfir.declare %arg1 {uniq_name = "_QFall2Es"} : (!fir.box>>) -> (!fir.box>>, !fir.box>>) + %3 = fir.load %1#0 : !fir.ref + %4 = hlfir.all %0#0 dim %3 : (!fir.box>>, i32) -> !hlfir.expr> + hlfir.assign %4 to %2#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %4 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPall2( +// CHECK: %[[ARG0:.*]]: !fir.box>> +// CHECK: %[[ARG1:.*]]: !fir.box>> +// CHECK: %[[ARG2:.*]]: !fir.ref +// CHECK-DAG: %[[MASK:.*]]:2 = hlfir.declare %[[ARG0]] +// CHECK-DAG: %[[DIM_VAR:.*]]:2 = hlfir.declare %[[ARG2]] +// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[ARG1]] + +// CHECK-DAG: %[[RET_BOX:.*]] = fir.alloca !fir.box>>> +// CHECK-DAG: %[[RET_ADDR:.*]] = fir.zero_bits !fir.heap>> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[RET_SHAPE:.*]] = fir.shape %[[C0]] : (index) -> !fir.shape<1> +// CHECK-DAG: %[[RET_EMBOX:.*]] = fir.embox %[[RET_ADDR]](%[[RET_SHAPE]]) +// CHECK-DAG: fir.store %[[RET_EMBOX]] to %[[RET_BOX]] + +// CHECK-DAG: %[[DIM:.*]] = fir.load %[[DIM_VAR]]#0 : !fir.ref +// CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] +// CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]]#1 + +// CHECK: %[[NONE:.*]] = fir.call @_FortranAAllDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] +// CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] +// CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] +// CHECK-NEXT: %[[SHIFT:.*]] = fir.shape_shift %[[BOX_DIMS]]#0, %[[BOX_DIMS]]#1 +// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]](%[[SHIFT]]) {uniq_name = ".tmp.intrinsic_result"} +// CHECK: %[[TRUE:.*]] = arith.constant true +// CHECK: %[[EXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.box>>, i1) -> !hlfir.expr> +// CHECK: hlfir.assign %[[EXPR]] to %[[RES]]#0 +// CHECK: hlfir.destroy %[[EXPR]] +// CHECK-NEXT: return +// CHECK-NEXT: } + +func.func @_QPall3(%arg0: !fir.ref>> {fir.bindc_name = "s"}) { + %0 = fir.address_of(@_QFall3Ea) : !fir.ref>> + %c2 = arith.constant 2 : index + %c2_0 = arith.constant 2 : index + %1 = fir.shape %c2, %c2_0 : (index, index) -> !fir.shape<2> + %2:2 = hlfir.declare %0(%1) {uniq_name = "_QFall3Ea"} : (!fir.ref>>, !fir.shape<2>) -> (!fir.ref>>, !fir.ref>>) + %c2_1 = arith.constant 2 : index + %3 = fir.shape %c2_1 : (index) -> 
!fir.shape<1> + %4:2 = hlfir.declare %arg0(%3) {uniq_name = "_QFall3Es"} : (!fir.ref>>, !fir.shape<1>) -> (!fir.ref>>, !fir.ref>>) + %c1_i32 = arith.constant 1 : i32 + %5 = hlfir.all %2#0 dim %c1_i32 : (!fir.ref>>, i32) -> !hlfir.expr<2x!fir.logical<4>> + hlfir.assign %5 to %4#0 : !hlfir.expr<2x!fir.logical<4>>, !fir.ref>> + hlfir.destroy %5 : !hlfir.expr<2x!fir.logical<4>> + return +} +// CHECK-LABEL: func.func @_QPall3( +// CHECK: %[[ARG0:.*]]: !fir.ref>> +// CHECK-DAG: %[[RET_BOX:.*]] = fir.alloca !fir.box>>> +// CHECK-DAG: %[[RET_ADDR:.*]] = fir.zero_bits !fir.heap>> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[RET_SHAPE:.*]] = fir.shape %[[C0]] : (index) -> !fir.shape<1> +// CHECK-DAG: %[[RET_EMBOX:.*]] = fir.embox %[[RET_ADDR]](%[[RET_SHAPE]]) +// CHECK-DAG: fir.store %[[RET_EMBOX]] to %[[RET_BOX]] +// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[ARG0]](%[[RES_SHAPE:.*]]) + +// CHECK-DAG: %[[MASK_ADDR:.*]] = fir.address_of +// CHECK-DAG: %[[MASK_VAR:.*]]:2 = hlfir.declare %[[MASK_ADDR]](%[[MASK_SHAPE:.*]]) +// CHECK-DAG: %[[MASK_BOX:.*]] = fir.embox %[[MASK_VAR]]#1(%[[MASK_SHAPE:.*]]) + +// CHECK-DAG: %[[DIM:.*]] = arith.constant 1 : i32 + +// CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] +// CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK_BOX]] : (!fir.box>>) -> !fir.box +// CHECK: %[[NONE:.*]] = fir.call @_FortranAAllDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) +// CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] +// CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] +// CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] +// CHECK-NEXT: %[[SHIFT:.*]] = fir.shape_shift %[[BOX_DIMS]]#0, %[[BOX_DIMS]]#1 +// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]](%[[SHIFT]]) {uniq_name = ".tmp.intrinsic_result"} +// CHECK: %[[TRUE:.*]] = arith.constant true +// CHECK: %[[EXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.box>>, i1) -> !hlfir.expr> +// CHECK: hlfir.assign %[[EXPR]] to %[[RES]] +// CHECK: hlfir.destroy %[[EXPR]] +// CHECK-NEXT: return +// CHECK-NEXT: } + +func.func @_QPall4(%arg0: !fir.box>> {fir.bindc_name = "a"}, %arg1: !fir.box>> {fir.bindc_name = "s"}, %arg2: !fir.ref>> {fir.bindc_name = "d"}) { + %0:2 = hlfir.declare %arg0 {uniq_name = "_QFall4Ea"} : (!fir.box>>) -> (!fir.box>>, !fir.box>>) + %1:2 = hlfir.declare %arg2 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFall4Ed"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) + %2:2 = hlfir.declare %arg1 {uniq_name = "_QFall4Es"} : (!fir.box>>) -> (!fir.box>>, !fir.box>>) + %3 = fir.load %1#0 : !fir.ref>> + %4 = fir.box_addr %3 : (!fir.box>) -> !fir.ptr + %5 = fir.load %4 : !fir.ptr + %6 = hlfir.no_reassoc %5 : i32 + %7 = hlfir.all %0#0 dim %6 : (!fir.box>>, i32) -> !hlfir.expr> + hlfir.assign %7 to %2#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %7 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPall4( +// CHECK: %[[ARG0:.*]]: !fir.box>> +// CHECK: %[[ARG1:.*]]: !fir.box>> +// CHECK: %[[ARG2:.*]]: !fir.ref>> +// CHECK-DAG: %[[MASK:.*]]:2 = hlfir.declare %[[ARG0]] +// CHECK-DAG: %[[DIM_ARG:.*]]:2 = hlfir.declare %[[ARG2]] +// CHECK-DAG: %[[RES:.*]]:2 = hlfir.declare %[[ARG1]] + +// CHECK-DAG: %[[RET_BOX:.*]] = fir.alloca !fir.box>>> +// CHECK-DAG: %[[DIM_PTR:.*]] = fir.load %[[DIM_ARG]]#0 : !fir.ref>> +// CHECK-DAG: %[[DIM_ADDR:.*]] = fir.box_addr %[[DIM_PTR]] +// CHECK-DAG: %[[DIM_VAR:.*]] = fir.load %[[DIM_ADDR]] +// CHECK-DAG: %[[DIM:.*]] = hlfir.no_reassoc %[[DIM_VAR]] + +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: 
%[[RET_ADDR:.*]] = fir.zero_bits !fir.heap>> +// CHECK-DAG: %[[RET_SHAPE:.*]] = fir.shape %[[C0]] : (index) -> !fir.shape<1> +// CHECK-DAG: %[[RET_EMBOX:.*]] = fir.embox %[[RET_ADDR]](%[[RET_SHAPE]]) +// CHECK-DAG: fir.store %[[RET_EMBOX]] to %[[RET_BOX]] +// CHECK-DAG: %[[RET_ARG:.*]] = fir.convert %[[RET_BOX]] +// CHECK-DAG: %[[MASK_ARG:.*]] = fir.convert %[[MASK]]#1 + +// CHECK: %[[NONE:.*]] = fir.call @_FortranAAllDim(%[[RET_ARG]], %[[MASK_ARG]], %[[DIM]], %[[LOC_STR:.*]], %[[LOC_N:.*]]) : (!fir.ref>, !fir.box, i32, !fir.ref, i32) -> none +// CHECK: %[[RET:.*]] = fir.load %[[RET_BOX]] +// CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[RET]] +// CHECK-NEXT: %[[ADDR:.*]] = fir.box_addr %[[RET]] +// CHECK-NEXT: %[[SHIFT:.*]] = fir.shape_shift %[[BOX_DIMS]]#0, %[[BOX_DIMS]]#1 +// CHECK-NEXT: %[[TMP:.*]]:2 = hlfir.declare %[[ADDR]](%[[SHIFT]]) {uniq_name = ".tmp.intrinsic_result"} +// CHECK: %[[TRUE:.*]] = arith.constant true +// CHECK: %[[EXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move %[[TRUE]] : (!fir.box>>, i1) -> !hlfir.expr> +// CHECK: hlfir.assign %[[EXPR]] to %[[RES]] +// CHECK: hlfir.destroy %[[EXPR]] +// CHECK-NEXT: return +// CHECK-NEXT: } \ No newline at end of file From 1a28b9bce75d184a74012fdf9f6e0fcbb7fd0e1c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 30 May 2023 15:53:26 +0100 Subject: [PATCH 088/704] [VPlan] Handle invariant GEPs in isUniformAfterVectorization. This fixes a crash caused by legal treating a scalable GEP as invariant, but isUniformAfterVectorization does not handle GEPs. Partially fixes https://github.com/llvm/llvm-project/issues/60831. Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D144434 --- llvm/lib/Transforms/Vectorize/VPlan.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 4ded278e8a582..055da0347d068 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2979,6 +2979,8 @@ inline bool isUniformAfterVectorization(VPValue *VPV) { assert(Def && "Must have definition for value defined inside vector region"); if (auto Rep = dyn_cast(Def)) return Rep->isUniform(); + if (auto *GEP = dyn_cast(Def)) + return all_of(GEP->operands(), isUniformAfterVectorization); return false; } } // end namespace vputils From f64f760e2d20340ef7d1a95c8598a90e42ac31e7 Mon Sep 17 00:00:00 2001 From: "Manna, Soumi" Date: Tue, 30 May 2023 07:58:02 -0700 Subject: [PATCH 089/704] [NFC][CLANG] Fix nullptr dereference issue in Type::getSveEltType() This patch uses castAs instead of getAs which will assert if the type doesn't match in clang::Type::getSveEltType(clang::ASTContext const &) Reviewed By: erichkeane Differential Revision: https://reviews.llvm.org/D151525 --- clang/lib/AST/Type.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 51e206d8c4636..508965fc38e55 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2434,7 +2434,7 @@ bool Type::isVLSTBuiltinType() const { QualType Type::getSveEltType(const ASTContext &Ctx) const { assert(isVLSTBuiltinType() && "unsupported type!"); - const BuiltinType *BTy = getAs(); + const BuiltinType *BTy = castAs(); if (BTy->getKind() == BuiltinType::SveBool) // Represent predicates as i8 rather than i1 to avoid any layout issues. 
// The type is bitcasted to a scalable predicate type when casting between From 68ae0d7803e43146b28f94f62357226047af7d9a Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Thu, 25 May 2023 20:26:27 +0000 Subject: [PATCH 090/704] [mlir] add initial chapters of the transform dialect tutorial The transform dialect has been around for a while and is sufficiently stable at this point. Add the first three chapters of the tutorial describing its usage and extension. Reviewed By: springerm Differential Revision: https://reviews.llvm.org/D151491 --- mlir/docs/Dialects/Transform.md | 16 +- mlir/docs/Tutorials/_index.md | 1 + mlir/docs/Tutorials/transform/Ch0.md | 314 +++++++++++++++ mlir/docs/Tutorials/transform/Ch1.md | 364 ++++++++++++++++++ mlir/docs/Tutorials/transform/Ch2.md | 327 ++++++++++++++++ mlir/docs/Tutorials/transform/Ch3.md | 283 ++++++++++++++ mlir/docs/Tutorials/transform/_index.md | 32 ++ mlir/examples/CMakeLists.txt | 1 + mlir/examples/transform/CMakeLists.txt | 4 + mlir/examples/transform/Ch2/CMakeLists.txt | 20 + .../transform/Ch2/include/CMakeLists.txt | 12 + .../transform/Ch2/include/MyExtension.h | 22 ++ .../transform/Ch2/include/MyExtension.td | 56 +++ .../examples/transform/Ch2/lib/CMakeLists.txt | 21 + .../transform/Ch2/lib/MyExtension.cpp | 132 +++++++ .../Ch2/transform-opt/transform-opt.cpp | 61 +++ mlir/examples/transform/Ch3/CMakeLists.txt | 20 + .../transform/Ch3/include/CMakeLists.txt | 19 + .../transform/Ch3/include/MyExtension.h | 32 ++ .../transform/Ch3/include/MyExtension.td | 98 +++++ .../transform/Ch3/include/MyExtensionTypes.td | 34 ++ .../examples/transform/Ch3/lib/CMakeLists.txt | 21 + .../transform/Ch3/lib/MyExtension.cpp | 218 +++++++++++ .../Ch3/transform-opt/transform-opt.cpp | 61 +++ mlir/examples/transform/README.md | 4 + mlir/test/CMakeLists.txt | 2 + .../transform/Ch1/invalidation-1.mlir | 98 +++++ .../transform/Ch1/invalidation-2.mlir | 102 +++++ .../test/Examples/transform/Ch1/sequence.mlir | 111 ++++++ mlir/test/Examples/transform/Ch2/invalid.mlir | 11 + mlir/test/Examples/transform/Ch2/ops.mlir | 26 ++ .../test/Examples/transform/Ch2/sequence.mlir | 110 ++++++ mlir/test/Examples/transform/Ch3/invalid.mlir | 10 + mlir/test/Examples/transform/Ch3/ops.mlir | 46 +++ .../test/Examples/transform/Ch3/sequence.mlir | 110 ++++++ mlir/test/lit.cfg.py | 2 + 36 files changed, 2786 insertions(+), 15 deletions(-) create mode 100644 mlir/docs/Tutorials/transform/Ch0.md create mode 100644 mlir/docs/Tutorials/transform/Ch1.md create mode 100644 mlir/docs/Tutorials/transform/Ch2.md create mode 100644 mlir/docs/Tutorials/transform/Ch3.md create mode 100644 mlir/docs/Tutorials/transform/_index.md create mode 100644 mlir/examples/transform/CMakeLists.txt create mode 100644 mlir/examples/transform/Ch2/CMakeLists.txt create mode 100644 mlir/examples/transform/Ch2/include/CMakeLists.txt create mode 100644 mlir/examples/transform/Ch2/include/MyExtension.h create mode 100644 mlir/examples/transform/Ch2/include/MyExtension.td create mode 100644 mlir/examples/transform/Ch2/lib/CMakeLists.txt create mode 100644 mlir/examples/transform/Ch2/lib/MyExtension.cpp create mode 100644 mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp create mode 100644 mlir/examples/transform/Ch3/CMakeLists.txt create mode 100644 mlir/examples/transform/Ch3/include/CMakeLists.txt create mode 100644 mlir/examples/transform/Ch3/include/MyExtension.h create mode 100644 mlir/examples/transform/Ch3/include/MyExtension.td create mode 100644 mlir/examples/transform/Ch3/include/MyExtensionTypes.td 
create mode 100644 mlir/examples/transform/Ch3/lib/CMakeLists.txt create mode 100644 mlir/examples/transform/Ch3/lib/MyExtension.cpp create mode 100644 mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp create mode 100644 mlir/examples/transform/README.md create mode 100644 mlir/test/Examples/transform/Ch1/invalidation-1.mlir create mode 100644 mlir/test/Examples/transform/Ch1/invalidation-2.mlir create mode 100644 mlir/test/Examples/transform/Ch1/sequence.mlir create mode 100644 mlir/test/Examples/transform/Ch2/invalid.mlir create mode 100644 mlir/test/Examples/transform/Ch2/ops.mlir create mode 100644 mlir/test/Examples/transform/Ch2/sequence.mlir create mode 100644 mlir/test/Examples/transform/Ch3/invalid.mlir create mode 100644 mlir/test/Examples/transform/Ch3/ops.mlir create mode 100644 mlir/test/Examples/transform/Ch3/sequence.mlir diff --git a/mlir/docs/Dialects/Transform.md b/mlir/docs/Dialects/Transform.md index b34bf76e76a80..f49444a801de3 100644 --- a/mlir/docs/Dialects/Transform.md +++ b/mlir/docs/Dialects/Transform.md @@ -1,23 +1,9 @@ # Transform Dialect -Fine-grain transformation control dialect. +Fine-grain transformation control dialect. See [../Tutorials/transform](tutorial) for more introductory information. [TOC] -## Disclaimer - -**This dialect is actively developed and may change frequently.** - -To decrease the maintenance burden and churn, please post a description of -the intended use case on the MLIR forum. A few in-tree use cases are -currently supported: - - - high-level transformations on "structured ops" (i.e. ops that operate on - chunks of data in a way that can be decomposed into operations on - smaller chunks of data and control flow) in Linalg, Tensor and Vector - dialects; - - loop transformations in the SCF dialect. - ## Overview This dialect provides operations that can be used to control transformation diff --git a/mlir/docs/Tutorials/_index.md b/mlir/docs/Tutorials/_index.md index 13186df19d7ba..5834e86478769 100644 --- a/mlir/docs/Tutorials/_index.md +++ b/mlir/docs/Tutorials/_index.md @@ -2,3 +2,4 @@ This section contains multiple MLIR tutorials. See [Toy tutorial](toy) for an introduction to using MLIR infrastructure. +See [Transform dialect tutorial](transform) for an introduction to using and extending of MLIR's Transform dialect. diff --git a/mlir/docs/Tutorials/transform/Ch0.md b/mlir/docs/Tutorials/transform/Ch0.md new file mode 100644 index 0000000000000..eb3272cced288 --- /dev/null +++ b/mlir/docs/Tutorials/transform/Ch0.md @@ -0,0 +1,314 @@ +# Chapter 0: A Primer on “Structured” Linalg Operations + +Before starting the tutorial on the Transform dialect, let us take a brief look at the concept of Structured operations and its implementation in the Linalg dialect. Note that the Transform dialect does not require Structured operations and vice versa. The two co-evolved at the beginning of the Transform dialect, which makes the subset of transformations for Structured operations the most mature and most suitable for the tutorial. If you are already familiar with this concept, skip to Chapter 1. + +Structured code generation intends to preserve the structure of the computation for as long as necessary to enable transformations, up to and including the design of IR abstractions that support specific transformations. 
+
## Uniform Elementwise Extension

Consider a simple scalar arithmetic addition operation in MLIR, which maps directly to a machine instruction on most architectures that support floating point operations:


```mlir
%2 = arith.addf %0, %1 : f32
```

This operation can be easily extended to uniformly apply to elements of a 1D vector, which is also often available as an instruction of vector machines:

```mlir
%2 = arith.addf %0, %1 : vector<8xf32>
```

Only a few modern instruction sets offer instructions for two- or more-dimensional vectors. In MLIR, however, it is possible to transparently extend the uniform elementwise application to vectors of arbitrary rank.

```mlir
%2 = arith.addf %0, %1 : vector<8x4xf32>
%5 = arith.addf %3, %4 : vector<2x2x2x2x2x2x2xf32>
```

As you can notice, MLIR’s arithmetic operations on vectors preserve the structure of uniform elementwise application. This structure can be leveraged by the compiler, for example, to produce smaller-rank operations available on the target or to fuse multiplication and addition when such a fused instruction is available (which becomes complicated when there are a hundred multiplications followed by a hundred additions).

## Reduction

Sometimes it is necessary to add elements of a vector to obtain a scalar. Some platforms provide specific instructions for this operation; others provide ones that can be combined to achieve the desired effect, such as addition of adjacent elements and element shuffle.

The Vector dialect in MLIR defines an operation to explicitly denote a within-vector reduction:

```mlir
%1 = vector.reduction <add>, %0 : vector<8xf32> into f32
```

When no support is available, such an operation can be transformed into a loop:

```mlir
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%init = arith.constant 0.0 : f32
%result = scf.for %i = %c0 to %c8 step %c1 iter_args(%partial = %init) -> (f32) {
  %element = vector.extractelement %0[%i : index] : vector<8xf32>
  %updated = arith.addf %partial, %element : f32
  scf.yield %updated : f32
}
```

Even when special instructions are available, it may still be desirable to use the loop form (with unrolling), depending on instruction latency and register pressure. Preserving the structure of the operation as a single reduction gives the compiler an understanding that a within-vector reduction is performed and, therefore, a choice in implementation.

## Contraction

Contraction is a generalization of reduction that multiplies elements from two vectors before adding them up. A simple “add” reduction can be thought of as a contraction where one of the vectors contains `1.0`, the neutral element of multiplication. Contractions offer even more flexibility to the compiler, and are represented by a dedicated operation in MLIR:

```mlir
// Neutral initializer for the addition.
%init = arith.constant 0.0 : f32
// Neutral element of multiplication.
%ones = arith.constant dense<1.0> : vector<8xf32>
// Actual contraction.
%result = vector.contract {
  indexing_maps = [affine_map<(i) -> (i)>,
                   affine_map<(i) -> (i)>,
                   affine_map<(i) -> ()>],
  iterator_types = ["reduction"]
} %0, %ones, %init : vector<8xf32>, vector<8xf32> into f32
```

Note the `affine_map` expressions indicating how vector elements are indexed. 
Their meaning is perhaps most evident when writing the loop form pseudo-code equivalent to this contraction: + +```mlir +for i in 0 to 8: + init += p0[i] * ones[i] +``` + +where both `%0` and `%ones` use the loop induction variable `i`, as noted on the right-hand side of the corresponding affine map, `(i) -> (i)`, and the `%init` does not, as reflected on the right-hand side of its affine map, `(i) -> ()`. + +Similarly to uniform elementwise extension, MLIR vector contractions are not limited to 1D cases. In the 2D+ case, one can additionally specify which of the vector dimensions are being reduced and which ones are being preserved. This can be achieved by using the `iterator_types` attribute that specifies, for each dimension, whether it is being reduced (`"reduction"`) or preserved (`"parallel"`). Consider the following 3D contraction that encodes a matrix-matrix multiplication: + +```mlir +%result = vector.contract { + indexing_maps = [affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)>], + iterator_types = ["parallel", "parallel", "reduction"] +} %lhs, %rhs, %init: vector<8x10xf32>, vector<10x16xf32> into vector<8x16xf32> +``` + +Looking at the indexing maps, it is easy to recognize the loop form: + +```mlir +for i in 0 to 8: + for j in 0 to 16: + for k in 0 to 10: + init[i, j] += lhs[i, k] * rhs[k, j] +``` + +Preserving this higher-level structure of a contraction makes it significantly easier for the compiler to recognize operations such as matrix multiplications and dot products and gives it freedom to produce lower-level operations that leverage most advanced instructions or even pre-generated microkernels. + +## Generic Operation on Memory + +Until now, we have been considering operations on vectors stored in virtual registers. A similar contraction abstraction can be defined in memory: + +```mlir +linalg.generic { + indexing_maps = [affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)>], + iterator_types = ["parallel", "parallel", "reduction"] +} ins(%lhs, %rhs : memref<8x10xf32>, memref<10x16xf32>) + outs(%init : memref<8x16xf32>) { +^bb0(%lhs_one: f32, %rhs_one: f32, %init_one: f32): + %0 = arith.mulf %lhs_one, %rhs_one : f32 + %1 = arith.addf %init_one, %0 : f32 + linalg.yield %1 : f32 +} +``` + +This looks more complicated, so let us unpack. The `indexing_maps` and `iterator_types` are _exactly_ the same as we have seen above for vector contractions. The operands are now split into two lists: + + +* `in` operands containing the buffers that are being only read by the operation; +* `out` operands that are being read and updated by the operation. + +This separation wasn’t necessary on vectors because, in MLIR, vectors are read-only (SSA or functional form) and operations mutating a vector are in fact producing a new one instead. + +Furthermore, the operation now contains a region that explicitly specifies the multiplication and the addition operations that were implicit in the contraction. Block arguments in the region correspond to individual elements read from the buffer: the first two correspond to the `in` operands and the last one corresponds to the `out` operand. The value yielded from the region is “written” to the `out` operand and is available as the last block argument for future executions of the region. 
Note that the order in which the region is executed for various tuples of elements read from the buffers is not specified, and the `out` buffer is written as a whole at the end of the operation.

## “Loop” Fusion

Since the region of the `generic` operation can contain arbitrarily many operations, we can use it to express “fusion” of the implicit loops by simply having more operations chained in the region. For example, the common machine learning rectified linear unit layer (ReLU), which can be defined as `relu(x) = max(0, x)`, can be expressed using the “compare-and-select” idiom in one `generic` operation, without the temporary buffer for the comparison result and without repeating the outer operation:

```mlir
linalg.generic {
  indexing_maps = [affine_map<(i) -> (i)>, affine_map<(i) -> (i)>],
  iterator_types = ["parallel"]
} ins(%in : memref<?xf32>) outs(%out : memref<?xf32>) {
^bb0(%in_one : f32, %out_one : f32):
  %c0 = arith.constant 0.0 : f32
  %0 = arith.cmpf ogt, %in_one, %c0 : f32
  %1 = arith.select %0, %in_one, %c0 : f32
  linalg.yield %1 : f32
}
```

Such operations can be converted to loops or lowered into vector forms after splitting into multiple operations, each of which maps to a Vector dialect primitive. This modeling, again, gives the compiler more choice in selecting the code generation strategy.

## Generic Operation on Tensors

Let us take one last step up on the abstraction ladder. MLIR provides a tensor abstraction that makes it easy for the compiler to reason about multidimensional yet regular data without having to solve complex problems such as alias analysis and dependency satisfaction, which would be necessary on multidimensional buffers. The tensor abstraction is very similar to the vector abstraction (major differences include the availability of unranked tensors, tensor layouts, and vectors being usable as elemental types of tensors but not of other vectors). Tensors are read-only, and operations updating a tensor produce a new tensor.

The `generic` operation from above can be lifted to operate on tensors instead of buffers:

```mlir
%result = linalg.generic {
  indexing_maps = [affine_map<(i, j, k) -> (i, k)>,
                   affine_map<(i, j, k) -> (k, j)>,
                   affine_map<(i, j, k) -> (i, j)>],
  iterator_types = ["parallel", "parallel", "reduction"]
} ins(%lhs, %rhs : tensor<8x10xf32>, tensor<10x16xf32>)
  outs(%init : tensor<8x16xf32>) {
^bb0(%lhs_one: f32, %rhs_one: f32, %init_one: f32):
  %0 = arith.mulf %lhs_one, %rhs_one : f32
  %1 = arith.addf %init_one, %0 : f32
  linalg.yield %1 : f32
} -> tensor<8x16xf32>
```

As you can notice, most components of this operation remain identical to its buffer version. It has been specifically designed this way. The main difference, besides the operand types, is that the operation now produces a new result instead of updating the `out` buffer. The `out` operand is used only as the initialization value.

If the `generic` operation had existed on vectors, it would have had the exact same structure.

## Tiling and Loop Materialization

At this level of abstraction, it becomes easy for the compiler to perform more advanced transformations usually required for high-performance code generation, such as [tiling](https://en.wikipedia.org/wiki/Loop_nest_optimization). Tiling, in general, can be seen as partitioning the iteration space into smaller parts, or tiles, so that the data required by each part fits into a level of cache, for example. 
The order in which tiles are executed must preserve the original data dependencies.

In the case of `generic` operations, the iteration space is implicit and is defined by the shape of the operands. Therefore, a tile can be expressed by performing the _same_ operation on a subset (slice) of the original data. Since the order in which the body of `generic` is applied to different tuples of the input elements is unspecified, tiles can be executed in any order, without the need for dependence analysis. In order to control the execution of different tiles, the implementation of tiling produces loops. Thus tiling `generic` operations can also be seen as materializing the loops that have been implicit until now.

For example, tiling the matrix multiplication presented above with tile sizes `(2, 8)` yields a loop nest around a `generic` expressing the same operation on a `2x8` tensor.

```mlir
// A special "multi-for" loop that supports tensor-insertion semantics
// as opposed to implicit updates. The resulting 8x16 tensor will be produced
// by this loop.
// The trip count of iterators is computed by dividing the original tensor size,
// 8x16, by the tile size, 2x8, to obtain 4x2.
// When tensor sizes are dynamic, the trip count computation is emitted as IR
// and is computed at runtime.
%0 = scf.forall (%i, %j) in (4, 2)
     shared_outs(%shared = %init) -> (tensor<8x16xf32>) {

  // Scale the loop induction variables by the tile sizes.
  %3 = affine.apply affine_map<(d0) -> (d0 * 2)>(%i)
  %4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%j)

  // Take slices of inputs and outputs. Only the "i" and "j" dimensions are sliced.
  %lhs_slice = tensor.extract_slice %lhs[%3, 0] [2, 10] [1, 1]
      : tensor<8x10xf32> to tensor<2x10xf32>
  %rhs_slice = tensor.extract_slice %rhs[0, %4] [10, 8] [1, 1]
      : tensor<10x16xf32> to tensor<10x8xf32>
  %result_slice = tensor.extract_slice %shared[%3, %4] [2, 8] [1, 1]
      : tensor<8x16xf32> to tensor<2x8xf32>

  // This is exactly the same operation as before, but now operating on smaller
  // slices of data.
  %partial = linalg.generic {
    indexing_maps = [affine_map<(i, j, k) -> (i, k)>,
                     affine_map<(i, j, k) -> (k, j)>,
                     affine_map<(i, j, k) -> (i, j)>],
    iterator_types = ["parallel", "parallel", "reduction"]
  } ins(%lhs_slice, %rhs_slice : tensor<2x10xf32>, tensor<10x8xf32>)
    outs(%result_slice : tensor<2x8xf32>) {
  ^bb0(%lhs_one: f32, %rhs_one: f32, %init_one: f32):
    %5 = arith.mulf %lhs_one, %rhs_one : f32
    %6 = arith.addf %init_one, %5 : f32
    linalg.yield %6 : f32
  } -> tensor<2x8xf32>

  // Terminator for the loop with tensor-insertion semantics. Inserts a slice
  // into a larger tensor, potentially in parallel.
  scf.forall.in_parallel {
    tensor.parallel_insert_slice %partial into %shared[%3, %4] [2, 8] [1, 1]
        : tensor<2x8xf32> into tensor<8x16xf32>
  }
}
```

## Producer/Consumer Fusion and Rematerialization

After materializing loops with tiling, another key code generation transformation becomes simple – fusion. Unlike loop fusion, the Structured operations approach allows for producer/consumer fusion even when the (implicit) iteration spaces of the operations do not match. Given a high-level structured operation on tensors, such as `linalg.generic`, one can follow use-def chains to identify:

1. the subset (slice) of the operand that is used by the tile, and
2. the tensor-level structured operation producing the whole tensor that is being sliced. 
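
In the tiled loop above, these two steps amount to recognizing the following pattern (a schematic sketch rather than a complete listing; `%full` stands for the result of the tensor-level operation computing the whole matrix, and `%3` and `%4` are the scaled induction variables from the example):

```mlir
// (1) Inside the tiled loop, only a 2x8 slice of the operand is actually used.
// (2) The sliced value %full is defined outside of the loop by another
//     tensor-level structured operation (here, the matrix multiplication).
%slice = tensor.extract_slice %full[%3, %4] [2, 8] [1, 1]
    : tensor<8x16xf32> to tensor<2x8xf32>
```

Fusion then replaces the `tensor.extract_slice` with a clone of the producer restricted to the corresponding slices of its own operands, as shown next.
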
+ +By inverting the `indexing_map` and applying it to the set of elements accessed through the slice, we can compute the part of the iteration space of the operation defining the full tensor necessary to compute the tile. Thus fusion boils down to replacing the `tensor.extract_slice` operation with the tile of the `linalg.generic` producing the original operand. + +Let us assume that the matrix multiplication operation is followed by another operation that multiplies each element of the resulting matrix with itself. This trailing elementwise operation has a 2D iteration space, unlike the 3D one in matrix multiplication. Nevertheless, it is possible to tile the trailing operation and then fuse the producer of its operand, the matmul, into the loop generated by tiling. The untiled dimension will be used in its entirety. + + +```mlir +// Same loop as before. +%0 = scf.forall (%i, %j) in (4, 2) + shared_outs(%shared = %init) + -> (tensor<8x16xf32>, tensor<8x16xf32>) { + // Scale the loop induction variables by the tile sizes. + %1 = affine.apply affine_map<(d0) -> (d0 * 2)>(%i) + %2 = affine.apply affine_map<(d0) -> (d0 * 8)>(%j) + + // Take slices of inputs and outputs. Only the "i" and "j" dimensions are sliced. + %lhs_slice = tensor.extract_slice %lhs[%1, 0] [2, 10] [1, 1] + : tensor<8x10xf32> to tensor<2x10xf32> + %rhs_slice = tensor.extract_slice %rhs[0, %2] [10, 8] [1, 1] + : tensor<10x16xf32> to tensor<10x8xf32> + %result_slice = tensor.extract_slice %result[%1, %2] [2, 8] [1, 1] + : tensor<8x16xf32> to tensor<2x8xf32> + + // This is exactly the same matmul slice as before. It replaces the slice + // extraction for the generic operation below. + %partial = linalg.generic { + indexing_maps = [affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)>], + iterator_types = ["parallel", "parallel", "reduction"] + } ins(%lhs_slice, %rhs_slice : tensor<2x10xf32>, tensor<10x8xf32>) + outs(%result_slice : tensor<2x8xf32>) { + ^bb0(%lhs_one: f32, %rhs_one: f32, %init_one: f32): + %5 = arith.mulf %lhs_one, %rhs_one : f32 + %6 = arith.addf %init_one, %5 : f32 + linalg.yield %6 : f32 + } -> tensor<2x8xf32> + + // Take the slice of the final result. Note that we don't need to take + // the slice of the operand because the matmul operation above computes + // it in-place. + %shared_slice = tensor.extract_slice %shared[%1, %2] [2, 8] [1, 1] + : tensor<8x16xf32> to tensor<2x8xf32> + + // The elementwise operation that we tiled. + %elemwise = linalg.generic { + indexing_maps = [affine_map<(i, j) -> (i, j)>, + affine_map<(i, j) -> (i, j)>], + iterator_types = ["parallel", "parallel"] + } ins(%partial : tensor<2x8xf32>) + outs(%shared_slice : tensor<2x8xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = arith.mulf %in, %in : f32 + linalg.yield %5 : f32 + } -> tensor<2x8xf32> + + // Terminator for the loop with tensor-insertion semantics. Inserts a slice + // into a larger tensor, potentially in parallel. + scf.forall.in_parallel { + tensor.parallel_insert_slice %elemwise into %shared[%1, %2] [2, 8] [1, 1] + : tensor<2x8xf32> into tensor<8x16xf32> + } +} +``` + +This process may result in some elements in the operand tensors being (re)computed on every iteration of the loop. This is also known as _rematerialization_ and expresses the tradeoff between performing redundant computations or storing their result in (slow) memory. 
+
+## Shorthand “Named” Forms of Linalg Ops
+
+Linalg provides a set of predefined operations for common cases such as matrix multiplication, dot product, convolution, etc. These operations are equivalent to the `generic` ones but spare the need to spell out the access patterns and the bodies. For example, matrix multiplication is simply:
+
+```mlir
+%matmul = linalg.matmul ins(%lhs, %rhs: tensor<8x10xf32>, tensor<10x16xf32>)
+                        outs(%init: tensor<8x16xf32>) -> tensor<8x16xf32>
+```
diff --git a/mlir/docs/Tutorials/transform/Ch1.md b/mlir/docs/Tutorials/transform/Ch1.md
new file mode 100644
index 0000000000000..988117a995724
--- /dev/null
+++ b/mlir/docs/Tutorials/transform/Ch1.md
@@ -0,0 +1,364 @@
+# Chapter 1: Combining Existing Transformations
+
+## Introduction
+
+The Transform dialect allows one to precisely target transformations at specific operations in the IR and to chain them, that is, to apply a transformation to operations produced by the previous transformation. To achieve this, transformations are expressed as other operations in the IR. We call the IR containing these operations the transform IR, and we call the IR that is being transformed the payload IR.
+
+Transform IR operations operate on values that may be associated with payload IR operations, values or attributes. We call the first two kinds of values operation and value handles, respectively. We call the last kind of values parameters.
+
+The application of transform IR always starts from one top-level operation. In the C++ API, this operation is passed to the `applyTransforms` function. This top-level operation specifies if other transformations should be performed and how. The most common top-level operation merely applies other transform operations listed in its body one after the other.
+
+Let us illustrate this with a simple sequence of transformations on the common “fully connected + bias + ReLU” ML layer, which boils down to performing a matrix multiplication, followed by an (elementwise) matrix addition and taking an elementwise maximum with 0. This can be expressed using the following IR:
+
+```mlir
+func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>,
+                   %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>)
+    -> tensor<512x512xf32> {
+  // Matrix-matrix multiplication.
+  %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>)
+                          outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32>
+
+  // Elementwise addition.
+  %biased = linalg.elemwise_binary { fun = #linalg.binary_fn<add> }
+    ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>)
+    outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32>
+
+  // Elementwise max with 0 (ReLU).
+  %c0f = arith.constant 0.0 : f32
+  %relued = linalg.elemwise_binary { fun = #linalg.binary_fn<max_signed> }
+    ins(%biased, %c0f : tensor<512x512xf32>, f32)
+    outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32>
+  func.return %relued : tensor<512x512xf32>
+}
+```
+
+## Top-Level Sequence Operation
+
+For performance reasons, we would like to tile and fuse these operations to exploit cache locality. This is a sequence of transformations that need to be performed one after another, so we naturally start with the corresponding top-level transform operation.
+
+```mlir
+transform.sequence failures(propagate) {
+^bb0(%arg0: !transform.any_op,
+     %arg1: !transform.op<"linalg.matmul">,
+     %arg2: !transform.op<"linalg.elemwise_binary">):
+  transform.yield
+}
+```
+
+There are several aspects worth noticing in this operation.
+
+The first entry block argument is mandatory for top-level transform operations and is associated with the top-level payload operation that the sequence is applied to, for example, a module or a function. This operation is specified when calling `applyTransforms`.
+
+The remaining entry block arguments are optional and can be associated with payload attributes, operations or values that are useful in the sequence. These are also specified when calling `applyTransforms`. In our case, we are interested in the matrix multiplication and elementwise operations that we are going to tile and fuse.
+
+All value handles have Transform dialect types. These types specify certain properties of the payload IR entities associated with them. In this example, `transform.any_op` indicates that the handle is associated with arbitrary payload operations. By contrast, `transform.op<"X">` indicates that the handle is associated _only_ with payload operations of kind `X`. These constraints are verified when the handle/payload association is created. For entry block arguments of top-level transform operations, this happens early in the `applyTransforms` function. If the constraints are not satisfied, the transform application fails and produces diagnostics for the user.
+
+## Failure Propagation
+
+Speaking about diagnostics, the `sequence` operation itself has a mandatory attribute specifying the failure propagation mode. There are two options:
+
+* “propagate” makes the sequence transformation fail if any of the nested transformations fails;
+* “suppress” makes the sequence succeed even if one of the nested transformations fails, but without attempting to perform the transformations following the failed one in the sequence.
+
+The latter allows the transformation to continue despite (recoverable) errors. As we are only building the transformation, it is preferable to propagate failures so we know when something did not apply.
+
+To check or debug a transform sequence, it is possible to print various entities associated with the transform IR values. For example, we can print the operations associated with the handles:
+
+```mlir
+transform.sequence failures(propagate) {
+^bb0(%arg0: !transform.any_op,
+     %arg1: !transform.op<"linalg.matmul">,
+     %arg2: !transform.op<"linalg.elemwise_binary">):
+  transform.test_print_remark_at_operand %arg1, "matmul"
+      : !transform.op<"linalg.matmul">
+  transform.test_print_remark_at_operand %arg2, "elemwise_binaries"
+      : !transform.op<"linalg.elemwise_binary">
+  transform.yield
+}
+```
+
+## Transform Dialect Interpreter
+
+Since we don’t want to recompile the compiler every time we change a transformation, we can use a transform dialect interpreter pass to apply this transformation sequence to the payload IR. As we will see in the next chapter, it is possible to define custom passes or even integrate the transform interpreter into a larger pass. For now, we can use the existing test pass:
+
+
+```sh
+$ mlir-opt matmul.mlir --pass-pipeline="
+    builtin.module(test-transform-dialect-interpreter{
+      bind-first-extra-to-ops=linalg.matmul
+      bind-second-extra-to-ops=linalg.elemwise_binary})"
+```
+
+The `matmul.mlir` file contains _both_ the payload IR function _and_ the transform IR sequence nested in the same module. The transform interpreter will find the first top-level transform operation in the root operation of the pass (the module in our case) and apply it to that root operation. In our case, we also asked the interpreter pass to associate the two extra arguments of the top-level sequence with all `linalg.matmul` and `linalg.elemwise_binary` payload operations through the respective pass options. Running this pass results in the expected remarks:
+
+```sh
+matmul.mlir:7:13: remark: matmul
+  %matmul = linalg.matmul
+            ^
+matmul.mlir:7:13: note: see current operation: %0 = linalg.matmul ins(%arg0, %arg1 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32>
+matmul.mlir:10:13: remark: elemwise_binaries
+  %biased = linalg.elemwise_binary { fun = #linalg.binary_fn<add> }
+            ^
+matmul.mlir:10:13: note: see current operation: %1 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>} ins(%0, %arg2 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32>
+matmul.mlir:14:13: remark: elemwise_binaries
+  %relued = linalg.elemwise_binary { fun = #linalg.binary_fn<max_signed> }
+            ^
+matmul.mlir:14:13: note: see current operation: %2 = linalg.elemwise_binary {fun = #linalg.binary_fn<max_signed>} ins(%1, %cst : tensor<512x512xf32>, f32) outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32>
+```
+
+Note that `%arg2` is associated with both elementwise payload operations. Any handle is associated with a list of entities. Individual transformations may or may not care about the order of elements in that list.
+
+
+## Specifying Transformations
+
+Now that we have handles to the operations we want to transform, we are ready to apply the transformations. Let us first try tiling the matmul operation itself.
+
+```mlir
+transform.sequence failures(propagate) {
+^bb0(%arg0: !transform.any_op,
+     %arg1: !transform.op<"linalg.matmul">,
+     %arg2: !transform.op<"linalg.elemwise_binary">):
+  // The actual tiling transformation takes tile sizes as attributes.
+  %loop, %tiled = transform.structured.tile_to_forall_op %arg1 tile_sizes [4, 32]
+      : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op)
+  transform.yield
+}
+```
+
+The transformation returns two handles, as indicated in its [documentation](https://mlir.llvm.org/docs/Dialects/Transform/#transformstructuredtile_to_forall_op-mlirtransformtiletoforallop):
+
+* A handle to the `scf.forall` “multi-for” loop around tensors.
+* A handle to `linalg.generic` operating on the subset of the original data.
+
+Running this transformation with the same command as above produces the tiled code, as expected.
+ +```mlir +func.func @fc_relu(%arg0: tensor<512x512xf32>, %arg1: tensor<512x512xf32>, %arg2: tensor<512x512xf32>, %arg3: tensor<512x512xf32>) -> tensor<512x512xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = scf.forall (%arg4, %arg5) in (128, 16) shared_outs(%arg6 = %arg3) -> (tensor<512x512xf32>) { + %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4) + %4 = affine.apply affine_map<(d0) -> (d0 * 32)>(%arg5) + %extracted_slice = tensor.extract_slice %arg0[%3, 0] [4, 512] [1, 1] + : tensor<512x512xf32> to tensor<4x512xf32> + %extracted_slice_0 = tensor.extract_slice %arg1[0, %4] [512, 32] [1, 1] + : tensor<512x512xf32> to tensor<512x32xf32> + %extracted_slice_1 = tensor.extract_slice %arg6[%3, %4] [4, 32] [1, 1] + : tensor<512x512xf32> to tensor<4x32xf32> + %5 = linalg.matmul + ins(%extracted_slice, %extracted_slice_0 + : tensor<4x512xf32>, tensor<512x32xf32>) + outs(%extracted_slice_1 : tensor<4x32xf32>) -> tensor<4x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %5 into %arg6[%3, %4] [4, 32] [1, 1] + : tensor<4x32xf32> into tensor<512x512xf32> + } + } + %1 = linalg.elemwise_binary {fun = #linalg.binary_fn} + ins(%0, %arg2 : tensor<512x512xf32>, tensor<512x512xf32>) + outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> + %2 = linalg.elemwise_binary {fun = #linalg.binary_fn} + ins(%1, %cst : tensor<512x512xf32>, f32) + outs(%arg3 : tensor<512x512xf32>) -> tensor<512x512xf32> + return %2 : tensor<512x512xf32> +} +``` + +Besides producing new handles, the tiling transform operation _consumes_ the operand handle. This means that the handle is _invalidated_ after this operation, and is no longer supposed to be used. Transform operations are required to mark all their operands as either consumed or readonly. Transform operations usually consume the operand if the associated payload operations are erased or recreated (which means erased and created anew with similar structure). As handles are essentially references to payload operations, they would become dangling if the payload no longer exists. + + +## Handle Invalidation and Expensive Checks Mode + +Undefined behavior is difficult to grapple with when it does happen, so the transform dialect interpreter provides a set of additional expensive checks that detect most undefined behavior in the transform IR. For example, if we wanted to use the `%arg1` handle after it is consumed, it would cause undefined behavior that manifests as an assertion in the debug build, and likely as a segmentation fault in the release mode. + +```mlir +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // The actual tiling transformation takes tile sizes as attributes. + %loop, %tiled = transform.structured.tile_to_forall_op %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) + + // This is trying to use an invalidated handle leading to undefined behavior. 
+ transform.test_print_remark_at_operand %arg1, "remark" : !transform.op<"linalg.matmul"> + transform.yield +} +``` + +However, with the expensive checks enabled in the interpreter, a nice diagnostic is produced: + +```sh +$ mlir-opt matmul.mlir --pass-pipeline=" + builtin.module(test-transform-dialect-interpreter{ + bind-first-extra-to-ops=linalg.matmul + bind-second-extra-to-ops=linalg.elemwise_binary, + enable-expensive-checks})" +``` + +```sh +matmul.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op + transform.test_print_remark_at_operand %mm, "elemwise_binaries" : !transform.any_op + ^ +matmul.mlir:26:9: note: handle to invalidated ops + %mm = transform.cast %matmul : !transform.op<"linalg.matmul"> to !transform.any_op + ^ +matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them + %loop, %tiled = transform.structured.tile_to_forall_op %mm tile_sizes [4, 32] +``` + +One may observe that some operations such as `transform.cast` do not consume the operand (because they don’t erase the corresponding operation). So what would happen if we tried to use that operand instead? + +```mlir +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // We can cast one type to another as long as operations are compatible + // with both types. This creates "aliasing" handles. + %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> + to !transform.any_op + + // The actual tiling transformation takes tile sizes as attributes. + %loop, %tiled = transform.structured.tile_to_forall_op %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) + + // Consuming an operand invalidates the consumed handle and any other handle that is + // associated with the same payload operations, or payload operations nested in them. + transform.test_print_remark_at_operand %casted, "remark" + : !transform.any_op + transform.yield +} +``` + +Both `%arg1` and `%casted` reference the same payload operation. Extending the reference analogy, these references alias. Naturally, when the payload operation is erased, all references to it become dangling. This is also the case for handles. In fact, consuming an operand invalidates the operand handle as well as any other handle that is associated with any of the same payload operations. The payload IR consideration is recursive: a handle associated with a payload operation _nested_ in the erased one is also invalidated (because erasing the operation also erases its regions and all contained operations). The expensive-checks mode can also handle this case. 
+ +```sh +matmul.mlir:28:3: error: op uses a handle invalidated by a previously executed transform op + transform.test_print_remark_at_operand %matmul, "elemwise_binaries" : !transform.op<"linalg.matmul"> + ^ +matmul.mlir:21:29: note: handle to invalidated ops +^bb0(%root: !transform.any_op, %matmul: !transform.op<"linalg.matmul">, %elemwise: !transform.op<"linalg.elemwise_binary">): + ^ +matmul.mlir:27:19: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them + %loop, %tiled = transform.structured.tile_to_forall_op %mm tile_sizes [4, 32] +``` + +## Chaining Transformations with Handles + +Going back to the transformation sequence, we have tiled the matrix multiplication, but we also want to tile and fuse the elementwise operations. The typical way of doing in the structured operations paradigm is to tile the last operation in some acyclic dataflow graph, and then progressively fuse the operations that produce its operands. This removes the need to explicitly tile all operations as fusion can adapt their sizes and inject recomputation if desired. So instead of tiling the matmul operation, we are going to tile the last operation in the chain, and then fuse the preceding operations into the loops produced by tiling. + +```mlir +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 + : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %loop, %tiled = transform.structured.tile_to_forall_op %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> !transform.any_op + %matmul_fused = transform.structured.fuse_into_containing_op %arg1 into %loop + : (!transform.op<"linalg.matmul">, !transform.any_op) -> !transform.any_op + + transform.yield +} + +``` + +This achieves the desired tiling and fusion. + +## More Handle Invalidation + +Finally, let us assume there exists an efficient microkernel, or a hardware instruction expressed as an intrinsic function, for a 4x4 matrix multiplication. For this purpose, we need to tile the fused operation to the desired size, and then outline it. The resulting function call can then be replaced with a call to the microkernel. 
+ +```mlir +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %loop, %tiled = transform.structured.tile_to_forall_op %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> !transform.any_op + %matmul_fused = transform.structured.fuse_into_containing_op %arg1 into %loop + : (!transform.op<"linalg.matmul">, !transform.any_op) -> !transform.any_op + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. + %loop_2, %tiled_2 = transform.structured.tile_to_forall_op %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2 = transform.structured.fuse_into_containing_op %matmul_fused into %loop_2 + : (!transform.any_op, !transform.any_op) -> !transform.any_op + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %outline_target, %_ = transform.structured.tile_to_forall_op %tiled_2 tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.structured.fuse_into_containing_op %matmul_fused_2 into %outline_target + : (!transform.any_op, !transform.any_op) -> !transform.any_op + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + transform.yield +} + +``` + +This additional transformation also illustrates handle invalidation for nested operations. The `transform.loop.outline` operation consumes the handle to the loop, which invalidates it and all handles to any operations nested in it, such as `%2`. Attempting to use this handle will cause undefined behavior. (Note that it isn’t strictly necessary for this specific form of the outlining to consume the operand as the implementation only _moves_ the region without recreating the operations, but the author of the transformation chose to invalidate the handle anyway.) 
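+
+As a concrete (hypothetical) illustration, such an invalid use could be appended to the sequence above as follows; it tries to print the loop handle that `transform.loop.outline` has just consumed, which is exactly the access reported in the diagnostic below (the remark text is arbitrary).
+
+```mlir
+  // Erroneous use: %outline_target was consumed by transform.loop.outline
+  // above, so the handle is invalidated and this access is rejected when
+  // expensive checks are enabled.
+  transform.test_print_remark_at_operand %outline_target, "outlined loop"
+      : !transform.any_op
+```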
+ +Attempting to access the fusion result after outlining produces the following error + +```sh +test/Examples/transform/Ch1/invalidation-2.mlir:109:3: error: op uses a handle invalidated by a previously executed transform op + transform.test_print_remark_at_operand %outline_target, "outlined loop" : !transform.any_op + ^ +test/Examples/transform/Ch1/invalidation-2.mlir:102:25: note: handle to invalidated ops + %outline_target, %_ = transform.structured.tile_to_forall_op %tiled_2 tile_sizes [1] + ^ +test/Examples/transform/Ch1/invalidation-2.mlir:106:18: note: invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + ^ +test/Examples/transform/Ch1/invalidation-2.mlir:24:13: note: ancestor payload op + %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } + ^ +test/Examples/transform/Ch1/invalidation-2.mlir:24:13: note: nested payload op + %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) +``` + +Note that the “add” elementwise operation is indicated as payload ancestor because it was used to produce the tile loop, and the loop therefore has its location. + +Finally, we would like to replace the call to the outlined function with a call to the microkernel. Unfortunately, the Transform dialect doesn’t have support for this transformation (and cannot have if the call is rewritten to a custom, out-of-tree operation). Therefore, we need to define new transform operations. The next chapters will describe how this can be done. diff --git a/mlir/docs/Tutorials/transform/Ch2.md b/mlir/docs/Tutorials/transform/Ch2.md new file mode 100644 index 0000000000000..2649be0b09ef3 --- /dev/null +++ b/mlir/docs/Tutorials/transform/Ch2.md @@ -0,0 +1,327 @@ +# Chapter 2: Adding a Simple New Transformation Operation + +## Setting Up to Add New Transformations + +Before defining a new transform operation, we need to choose where its implementation should be located. While MLIR encourages upstream contributions, it is not always possible or even desirable to modify the main Transform dialect, for example, if the transformation is specific to some out-of-tree dialect that is not itself available upstream. + +The Transform dialect uses the dialect extension mechanism to allow additional operations to be injected without modifying the dialect itself. Dialect extensions are registered with the context and loaded when the dialect itself is loaded. Extension definition is straightforward: + +```cpp +// In MyExtension.cpp. +#include "mlir/Dialect/Transform/IR/TransformDialect.h" + +// Define a new transform dialect extension. This uses the CRTP idiom to identify +// extensions. +class MyExtension : public ::mlir::transform::TransformDialectExtension { +public: + // The extension must derive the base constructor. + using Base::Base; + + // This function initializes the extension, similarly to `initialize` in dialect + // definitions. List individual operations and dependent dialects here. + void init(); +}; + +void MyExtension::init() { + // Similarly to dialects, an extension can declare a dependent dialect. This dialect + // will be loaded along with the extension and, therefore, along with the Transform + // dialect. Only declare as dependent the dialects that contain the attributes or + // types used by transform operations. 
Do NOT declare as dependent the dialects + // produced during the transformation. + // declareDependentDialect(); + + // When transformations are applied, they may produce new operations from previously + // unloaded dialects. Typically, a pass would need to declare itself dependent on + // the dialects containing such new operations. To avoid confusion with the dialects + // the extension itself depends on, the Transform dialects differentiates between: + // - dependent dialects, which are used by the transform operations, and + // - generated dialects, which contain the entities (attributes, operations, + // types) that may be produced by applying the transformation even when not + // present in the original payload IR. + // In the following chapter, we will be add operations that generate function calls + // and structured control flow operations, so let's declare the corresponding + // dialects as generated. + declareGeneratedDialect<::mlir::scf::SCFDialect>(); + declareGeneratedDialect<::mlir::func::FuncDialect>(); + + // Finally, we register the additional transform operations with the dialect. + registerTransformOps< + // TODO: list the operation classes. + >(); +} +``` + +The operations themselves can be defined using ODS, exactly in the same way as regular operations in a dialect. + +```tablegen +// In MyExtension.td +#ifndef MY_EXTENSION +#define MY_EXTENSION + +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/IR/TransformInterfaces.td" +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def MyOp : Op { + let summary = "my transform op"; + // TODO: define the operation properties. +} + +#endif // MY_EXTENSION +``` + +Similarly to dialects, we must use Tablegen to generate the header and implementation of these operations. We can instruct CMake to do it as follows. + + +```sh +# In CMakeLists.txt next to MyExtension.td. + +# Tell Tablegen to use MyExtension.td as input. +set(LLVM_TARGET_DEFINITIONS MyExtension.td) + +# Ask Tablegen to generate op declarations and definitions from ODS. +mlir_tablegen(MyExtension.h.inc -gen-op-decls) +mlir_tablegen(MyExtension.cpp.inc -gen-op-defs) + +# Add a CMakeTarget we can depend on to ensure the generation happens before the compilation. +add_public_tablegen_target(MyExtensionIncGen) + +# Don't forget to generate the documentation, this will produce a MyExtension.md under +# Dialects. +add_mlir_doc(MyExtension MyExtension Dialects/ -gen-op-doc) +``` + +```sh +# In CMakeLists.txt next to MyExtension.cpp +add_mlir_library( + # Library called MyExtension. + MyExtension + + # Built from the following source files. + MyExtension.cpp + + # Make sure ODS declaration and definitions are generated before compiling this. + DEPENDS + MyExtensionIncGen + + # Link in the transform dialect, and all generated dialects. + LINK_LIBS PUBLIC + MLIRTransformDialect + MLIRFuncDialect + MLIRSCFDialect +) +``` + +This will generate two files, `MyExtension.h.inc` and `MyExtension.cpp.inc`, that are supposed to be included into the declaration and definition of the transform operations, respectively. + +```c++ +// In MyExtension.h. +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" + +#define GET_OP_CLASSES +#include "MyExtension.h.inc" +``` + +```c++ +// In MyExtension.cpp. 
+ +#define GET_OP_CLASSES +#include "MyExtension.cpp.inc" + +// … +void MyExtension::init() { + // … + + // Finally, we register the additional transform operations with the dialect. List all + // operations generated from ODS. This call will perform additional checks that the + // operations implement the transform and memory effect interfaces required by the + // dialect interpreter and assert if they do not. + registerTransformOps< +#define GET_OP_LIST +#include "MyExtension.cpp.inc" + >(); +} +``` + +## Defining a Transform Operation + +With this setup, we are now ready to define the new transform operation to rewrite the function call. This is identical to defining a regular operation in a dialect. Note that the Transform dialect requires operations to implement the `TransformOpInterface` as well as `MemoryEffectsOpInterface` to indicate whether the operands are consumed or only read. Our operation can be defined along the following lines. + +```tablegen +// In MyExtension.td. + +// Define the new operation. By convention, prefix its name with the name of the dialect +// extension, "my.". The full operation name will be further prefixed with "transform.". +def ChangeCallTargetOp : Op, + DeclareOpInterfaceMethods]> { + // Provide a brief and a full description. It is recommended that the latter describes + // the effects on the operands and how the operation processes various failure modes. + let summary = "Changes the callee of a call operation to the specified one"; + let description = [{ + For each `func.call` payload operation associated with the handle, changes its + callee to be the symbol whose name is provided as an attribute to this operation. + + Generates a silenceable failure if the operand is associated with payload operations + that are not `func.call`. + Only reads the operand. + }]; + + // The arguments include the handle to the payload operations and the attribute that + // specifies the new callee. The handle must implement TransformHandleTypeInterface. + // We use a string attribute as the symbol may not exist in the transform IR so the + // verification may fail. + let arguments = (ins + TransformHandleTypeInterface:$call, + StrAttr:$new_target); + + // The results are empty as the transformation does not produce any new payload. + let results = (outs); + + // Provide nice syntax. + let assemblyFormat = "$call `,` $new_target attr-dict `:` type($call)"; +} +``` + +To finalize the definition of the transform operation, we need to implement the interface methods. The `TransformOpInterface` currently requires only one method – `apply` – that performs the actual transformation. It is a good practice to limit the body of the method to manipulation of the Transform dialect constructs and have the actual transformation implemented as a standalone function so it can be used from other places in the code. + + +```c++ +// In MyExtension.cpp + +// Implementation of our transform dialect operation. 
+// This operation returns a tri-state result that can be one of: +// - success when the transformation succeeded; +// - definite failure when the transformation failed in such a way that following +// transformations are impossible or undesirable, typically it could have left payload +// IR in an invalid state; it is expected that a diagnostic is emitted immediately +// before returning the definite error; +// - silenceable failure when the transformation failed but following transformations +// are still applicable, typically this means a precondition for the transformation is +// not satisfied and the payload IR has not been modified. +// The silenceable failure additionally carries a Diagnostic that can be emitted to the +// user. +::mlir::DiagnosedSilenceableFailure ChangeCallTargetOp::apply( + // The list of payload IR entities that will be associated with the transform IR + // values defined by this transform operation. In this case, it can remain empty as + // there are no results. + ::mlir::transform::TransformResults &results, + // The transform application state. This object can be used to query the current + // associations between transform IR values and payload IR entities. It can also + // carry additional user-defined state. + ::mlir::transform::TransformState &state) { + + // First, we need to obtain the list of payload operations that are associated with + // the operand handle. + auto payload = state.getPayloadOps(getCall()); + + // Then, we iterate over the list of operands and call the actual IR-mutating + // function. We also check the preconditions here. + for (Operation *payloadOp : payload) { + auto call = dyn_cast<::mlir::func::CallOp>(payloadOp); + if (!call) { + DiagnosedSilenceableFailure diag = emitSilenceableError() + << "only applies to func.call payloads"; + diag.attachNote(payloadOp->getLoc()) << "offending payload"; + return diag; + } + + updateCallee(call, getNewTarget()); + } + + // If everything went well, return success. + return DiagnosedSilenceableFailure::success(); +} +``` + +The implementation of the `MemoryEffectsOpInterface` must specify the effects this operation has on its operands (consumed or readonly) and on the payload IR (mutates or readonly). Transform dialect verifiers will check for side effects being present and assert in debug builds if they are not. + +```c++ +// In MyExtension.cpp + +void ChangeCallTargetOp::getEffects( + ::llvm::SmallVectorImpl<::mlir::MemoryEffects::EffectInstance> &effects) { + // Indicate that the `call` handle is only read by this operation because the + // associated operation is not erased but rather modified in-place, so the + // reference to it remains valid. + onlyReadsHandle(getCall(), effects); + + // Indicate that the payload is modified by this operation. + modifiesPayload(effects); +} +``` + +## Registration and Usage + +This is enough to define transform operations. The only remaining bit is providing the extension registration hook that can be called from the project’s `main`. + + +```c++ +// In TransformDialect.cpp (don't forget a declaration in TransformDialect.h); + +void registerMyExtension(::mlir::DialectRegistry ®istry) { + registry.addExtensions(); +} +``` + +After registering the extension, it becomes possible to use our new operation in the transform dialect interpreter. The upstream testing pass can be used as is. 
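+
+For example, assuming the extension is linked into a binary that also registers the test interpreter pass (the tutorial's CMake files build such a tool under the name `transform-opt-ch2`), an invocation along the lines of the one used in Chapter 1 should work; the exact binary name and registration details depend on how the project is set up:
+
+```sh
+# Hypothetical invocation; the pass options mirror those used in Chapter 1.
+$ transform-opt-ch2 matmul.mlir --pass-pipeline="
+    builtin.module(test-transform-dialect-interpreter{
+      bind-first-extra-to-ops=linalg.matmul
+      bind-second-extra-to-ops=linalg.elemwise_binary})"
+```
+
+The transform sequence below then drives the same tile, fuse and outline pipeline as in Chapter 1 and finishes by invoking the new operation on the outlined call.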
+ +```mlir +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %loop, %tiled = transform.structured.tile_to_forall_op %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> !transform.any_op + %matmul_fused = transform.structured.fuse_into_containing_op %arg1 into %loop + : (!transform.op<"linalg.matmul">, !transform.any_op) -> !transform.any_op + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. + %loop_2, %tiled_2 = transform.structured.tile_to_forall_op %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2 = transform.structured.fuse_into_containing_op %matmul_fused into %loop_2 + : (!transform.any_op, !transform.any_op) -> !transform.any_op + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %outline_target, %_ = transform.structured.tile_to_forall_op %tiled_2 tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.structured.fuse_into_containing_op %matmul_fused_2 into %outline_target + : (!transform.any_op, !transform.any_op) -> !transform.any_op + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Rewrite the call target. + transform.my.change_call_target %call, "microkernel" : !transform.any_op + + transform.yield +} +``` diff --git a/mlir/docs/Tutorials/transform/Ch3.md b/mlir/docs/Tutorials/transform/Ch3.md new file mode 100644 index 0000000000000..4a8c17a73e59f --- /dev/null +++ b/mlir/docs/Tutorials/transform/Ch3.md @@ -0,0 +1,283 @@ +# Chapter 3: More than Simple Transform Operations + +## Type Constraints and ApplyEach Trait + +A transform operation that applies to each payload operation individually and requires it to be of a specific kind is a repeated pattern. One can use Transform dialect types to specify the preconditions of the type. 
Specifically, we can change the expected operand type from the wide `TransformHandleTypeInterface` to the narrower `Transform_ConcreteOpType<"func.call">`. Furthermore, we use the `TransformEachOpTrait` trait to provide the skeleton implementation of the `apply` method that performs verification, iteration over payloads and result concatenation. The improved ODS op definition is as follows.
+
+```tablegen
+// In MyExtension.td.
+
+// Define the new operation. By convention, prefix its name with the name of the dialect extension, "my.". The full operation name will be further prefixed with "transform.".
+def ChangeCallTargetOp : Op]> {
+  // Provide a brief and a full description. It is recommended that the latter describes
+  // the effects on the operands and how the operation processes various failure modes.
+  let summary = "Changes the callee of a call operation to the specified one";
+  let description = [{
+    For each `func.call` payload operation associated with the handle, changes its
+    callee to be the symbol whose name is provided as an attribute to this operation.
+
+    Generates a silenceable failure if the operand is associated with payload operations
+    that are not `func.call`.
+    Only reads the operand.
+  }];
+
+  // The arguments include the handle to the payload operations and the attribute that
+  // specifies the new callee. The handle must implement TransformHandleTypeInterface.
+  // We use a string attribute as the symbol may not exist in the transform IR so the
+  // verification may fail.
+  let arguments = (ins
+    Transform_ConcreteOpType<"func.call">:$call,
+    StrAttr:$new_target);
+
+  // The results are empty as the transformation does not produce any new payload.
+  let results = (outs);
+
+  // Provide nice syntax.
+  let assemblyFormat = "$call `,` $new_target attr-dict `:` type($call)";
+
+  // Declare the function implementing the interface for a single payload operation.
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::func::CallOp call,
+        ::mlir::transform::ApplyToEachResultList &results,
+        ::mlir::transform::TransformState &state);
+  }];
+}
+```
+
+Now, instead of defining the `apply` method with a loop, we can simply define a function that applies to an individual payload operation and the trait will take care of the rest.
+
+```c++
+::mlir::DiagnosedSilenceableFailure ChangeCallTargetOp::applyToOne(
+    ::mlir::func::CallOp call,
+    ::mlir::transform::ApplyToEachResultList &results,
+    ::mlir::transform::TransformState &state) {
+  // Call the actual transformation function.
+  updateCallee(call, getNewTarget());
+  // Indicate success.
+  return DiagnosedSilenceableFailure::success();
+}
+```
+
+## Defining a Transform Type
+
+In addition to operations, the Transform dialect allows extensions to define and inject additional attributes and types. As we have seen above, transform types are used to specify constraints on the payload operations. Our call rewriting operation currently applies only to `func.call`. We may want to generalize it to apply to any payload operation that implements `CallOpInterface`, but the Transform dialect currently doesn’t provide a type that checks if a payload operation implements this interface. Let’s define it in our extension.
+
+Type definition is again identical to defining a dialect type with ODS.
+
+```tablegen
+// Transform dialect allows additional types to be defined and injected.
+def CallOpInterfaceHandle + : TypeDef]> { + + // The usual components of a type such as description, mnemonic and assembly format + // should be provided. + let summary = "handle to payload operations implementing CallOpInterface"; + let mnemonic = "my.call_op_interface"; + let assemblyFormat = ""; +} +``` + +We will omit the generation of declaration and definitions using Tablegen for brevity as it is identical to the regular case. + +To finalize the definition of a transform type, one must implement the interface methods. + +```c++ +// In MyExtension.cpp. + +// The interface declares this method to verify constraints this type has on +// payload operations. It returns the now familiar tri-state result. +mlir::DiagnosedSilenceableFailure +mlir::transform::CallOpInterfaceHandleType::checkPayload( + // Location at which diagnostics should be emitted. + mlir::Location loc, + // List of payload operations that are about to be associated with the + // handle that has this type. + llvm::ArrayRef payload) const { + + // All payload operations are expected to implement CallOpInterface, check this. + for (Operation *op : payload) { + if (llvm::isa(op)) + continue; + + // By convention, these verifiers always emit a silenceable failure since they are + // checking a precondition. + DiagnosedSilenceableFailure diag = emitSilenceableError(loc) + << "expected the payload operation to implement CallOpInterface"; + diag.attachNote(op->getLoc()) << "offending operation"; + return diag; + } + + // If everything is okay, return success. + return DiagnosedSilenceableFailure::success(); +} + +``` + +Additional attributes and types need to be registered in the extension, next to operations. + +```c++ +// In MyExtension.cpp. + +void MyExtension::init() { + // … + + registerTypes< +#define GET_TYPEDEF_LIST +#include "MyExtensionTypes.cpp.inc" + >(); +} +``` + +This type is now directly available in the transform dialect and can be used in operations. + + +```mlir + // Cast to our new type. + %casted = transform.cast %call : !transform.any_op to !transform.my.call_op_interface + // Using our new operation. + transform.my.change_call_target %casted, "microkernel" : !transform.my.call_op_interface +``` + +## Operand Consumption + +As an exercise, let us modify the rewriting operation to consume the operand. This would be necessary, for example, if the transformation were to rewrite the `func.call` operation into a custom operation `my.mm4`. Since the operand handle is now consumed, the operation can return a new handle to the newly produced payload operation. Otherwise, the ODS definition of the transform operation remains unchanged. + + +```tablegen +// In MyExtension.td. + +// Define another transform operation. +def CallToOp : Op]> { + // Summary and description omitted for brevity. + + // The argument is the handle to the payload operations. + let arguments = (ins CallOpInterfaceHandle:$call); + + // The result is the handle to the payload operations produced during the + // transformation. + let results = (outs TransformHandleTypeInterface:$transformed); + + // Provide nice syntax. + let assemblyFormat = "$call attr-dict `:` functional-type(inputs, outputs)"; + + // Declare the function implementing the interface for a single payload operation. 
+ let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::CallOpInterface call, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} +``` + +Now let’s look at the implementation of interface methods. + +```c++ +// In MyExtension.cpp. + +::mlir::DiagnosedSilenceableFailure CallToOp::applyToOne( + ::mlir::CallOpInterface call, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state) { + // Call the actual rewrite. + Operation *rewritten = rewriteToOp(call); + + // Report an error if the rewriter produced a null pointer. Note that it may have + // irreversibly modified the payload IR, so we produce a definite failure. + if (!rewritten) { + return emitDefiniteError() << "failed to rewrite call to operation"; + } + + // On success, push the resulting operation into the result list. The list is expected + // to contain exactly one entity per result and per application. The handles will be + // associated with lists of the respective values produced by each application. + results.push_back(rewritten); + + // If everything is fine, return success. + return DiagnosedSilenceableFailure::success(); +} + +void CallToOp::getEffects( + ::llvm::SmallVectorImpl<::mlir::MemoryEffects::EffectInstance> &effects) { + // Indicate using side effects that the operand handle is consumed, and the + // result handle is produced. + consumesHandle(getCall(), effects); + producesHandle(getTransformed(), effects); + + // Indicate that the payload IR is modified. + modifiesPayload(effects); +} +``` + +The overall flow of these implementations is similar to the previous one. The application also needs to specify the resulting entities that are going to be associated with the handles it produces. Operations are required to specify the entities to associate with _all_ results on success, even if the list is empty. An assertion will be triggered if it is not the case. In case of failure, the interpreter will automatically associate all results that are not yet defined with empty lists. + +Note that since `applyToOne` always expects one payload entity to be associated with each result handle in each application, it cannot be used to return handles associated with empty lists for non-empty operand handles. Instead, one would use `apply` directly. + +```c++ +::mlir::DiagnosedSilenceableFailure SomeOtherOp::apply( + ::mlir::transform::TransformResults &results, + ::mlir::transform::TransformState &state) { + // ... + + // Associate the result `transformed` with an empty list of payload operations. + results.set(cast(getTransformed()), {}); + return DiagnosedSilenceableFailure::success(); +} +``` + +## Memory Effects Traits + +Some common memory effect patterns are also available as traits to minimize the boilerplate. + +* `FunctionalStyleTransformOpTrait` indicates that all handle-typed operands are consumed, all results are produced, and the payload IR is modified. +* `NavigationTransformOpTrait` indicates that all handle-typed operands are only read, all results are produced, and the payload IR is only read. + +Using these traits removes the need to declare or define the methods of the `MemoryEffectsOpInterface`. + +```tablegen +// In MyExtension.td. + +// Define another transform operation. +def CallToOp : Op { + // Summary and description omitted for brevity. + + // The argument is the handle to the payload operations. 
+ let arguments = (ins CallOpInterfaceHandle:$call); + + // The result is the handle to the payload operations produced during the + // transformation. + let results = (outs TransformHandleTypeInterface:$transformed); + + // Provide nice syntax. + let assemblyFormat = "$call attr-dict `:` functional-type(operands, results)"; + + // Declare the function implementing the interface for a single payload operation. + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::CallOpInterface call, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} +``` + + diff --git a/mlir/docs/Tutorials/transform/_index.md b/mlir/docs/Tutorials/transform/_index.md new file mode 100644 index 0000000000000..bb09cdee212b8 --- /dev/null +++ b/mlir/docs/Tutorials/transform/_index.md @@ -0,0 +1,32 @@ +# Transform Dialect Tutorial + +MLIR supports declarative specification for controlling compiler transformations +via the transform dialect. It allows one to request compiler transformations +using compiler IR itself, which can be embedded into the original IR that is +being transformed (similarly to pragmas) or supplied separately (similarly to +scheduling languages). This tutorial presents the concepts of the MLIR transform +dialect and related infrastructure. It will be accompanied by a practical +demonstration of three use scenarios: + +- Composing transform dialect operations available in (upstream) MLIR to perform + a sequence of optimizing transformations that results in efficient code for an + MLIR linear algebra operation. +- Defining new transform dialect operations and adapting existing transformation + code to work with the transform dialect infrastructure. +- Setting up and using the transform dialect infrastructure in a downstream + out-of-tree project with custom dialects, transformations and passes. + +After following the tutorial, one will be able to apply the transform dialect in +their work and extend it when necessary. Basic familiarity with MLIR is a +prerequisite. See [Toy tutorial](../Toy) for introduction to MLIR. + +The tutorial is divided into the following chapters. + +- [Chapter #0](Ch0.md): A Primer on “Structured” Linalg Operations +- [Chapter #1](Ch1.md): Combining Existing Transformations +- [Chapter #2](Ch2.md): Adding a Simple New Transformation Operation +- [Chapter #3](Ch3.md): More than Simple Transform Operations + +The code corresponding to this tutorial is located under +`mlir/Examples/transform` and the corresponding tests in +`mlir/test/Examples/transform`. diff --git a/mlir/examples/CMakeLists.txt b/mlir/examples/CMakeLists.txt index 37c89d0bae965..bdbba3ead9abf 100644 --- a/mlir/examples/CMakeLists.txt +++ b/mlir/examples/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(toy) +add_subdirectory(transform) diff --git a/mlir/examples/transform/CMakeLists.txt b/mlir/examples/transform/CMakeLists.txt new file mode 100644 index 0000000000000..3f3740ad2a8da --- /dev/null +++ b/mlir/examples/transform/CMakeLists.txt @@ -0,0 +1,4 @@ +add_custom_target(TransformExample) + +add_subdirectory(Ch2) +add_subdirectory(Ch3) diff --git a/mlir/examples/transform/Ch2/CMakeLists.txt b/mlir/examples/transform/Ch2/CMakeLists.txt new file mode 100644 index 0000000000000..56aefefff754c --- /dev/null +++ b/mlir/examples/transform/Ch2/CMakeLists.txt @@ -0,0 +1,20 @@ +# For a better top-level template to copy, see examples/standalone. 
+ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + +add_subdirectory(include) +add_subdirectory(lib) + +add_dependencies(TransformExample transform-opt-ch2) +add_llvm_example(transform-opt-ch2 + transform-opt/transform-opt.cpp) + +target_link_libraries(transform-opt-ch2 + PRIVATE + MLIRIR + MLIRMlirOptMain + MLIRSideEffectInterfaces + MyExtensionCh2 +) diff --git a/mlir/examples/transform/Ch2/include/CMakeLists.txt b/mlir/examples/transform/Ch2/include/CMakeLists.txt new file mode 100644 index 0000000000000..fcdc68a160f4b --- /dev/null +++ b/mlir/examples/transform/Ch2/include/CMakeLists.txt @@ -0,0 +1,12 @@ +# Tell Tablegen to use MyExtension.td as input. +set(LLVM_TARGET_DEFINITIONS MyExtension.td) + +# Ask Tablegen to generate op declarations and definitions from ODS. +mlir_tablegen(MyExtension.h.inc -gen-op-decls) +mlir_tablegen(MyExtension.cpp.inc -gen-op-defs) + +# Add a CMakeTarget we can depend on to ensure the generation happens before the compilation. +add_public_tablegen_target(MyExtensionCh2IncGen) + +# Don't forget to generate the documentation, this will produce a MyExtension.md under Dialects. +add_mlir_doc(MyExtension MyExtensionCh2 Dialects/ -gen-op-doc) diff --git a/mlir/examples/transform/Ch2/include/MyExtension.h b/mlir/examples/transform/Ch2/include/MyExtension.h new file mode 100644 index 0000000000000..03a24a190e15e --- /dev/null +++ b/mlir/examples/transform/Ch2/include/MyExtension.h @@ -0,0 +1,22 @@ +//===-- MyExtension.h - Transform dialect tutorial --------------*- c++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines Transform dialect extension operations used in the +// Chapter 2 of the Transform dialect tutorial. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" + +#define GET_OP_CLASSES +#include "MyExtension.h.inc" + +// Registers our Transform dialect extension. +void registerMyExtension(::mlir::DialectRegistry ®istry); diff --git a/mlir/examples/transform/Ch2/include/MyExtension.td b/mlir/examples/transform/Ch2/include/MyExtension.td new file mode 100644 index 0000000000000..4824b83e6c184 --- /dev/null +++ b/mlir/examples/transform/Ch2/include/MyExtension.td @@ -0,0 +1,56 @@ +//===-- MyExtension.td - Transform dialect tutorial --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines Transform dialect extension operations used in the +// Chapter 2 of the Transform dialect tutorial. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MY_EXTENSION +#define MY_EXTENSION + +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/IR/TransformInterfaces.td" +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +// Define the new operation. By convention, prefix its name with the name of the dialect +// extension, "my.". The full operation name will be further prefixed with "transform.". +def ChangeCallTargetOp : Op, + DeclareOpInterfaceMethods]> { + // Provide a brief and a full description. It is recommended that the latter describes + // the effects on the operands and how the operation processes various failure modes. + let summary = "Changes the callee of a call operation to the specified one"; + let description = [{ + For each `func.call` payload operation associated with the handle, changes its + callee to be the symbol whose name is provided as an attribute to this operation. + + Generates a silenceable failure if the operand is associated with payload operations + that are not `func.call`. + Only reads the operand. + }]; + + // The arguments include the handle to the payload operations and the attribute that + // specifies the new callee. The handle must implement TransformHandleTypeInterface. + // We use a string attribute as the symbol may not exist in the transform IR so the + // verification may fail. + let arguments = (ins + TransformHandleTypeInterface:$call, + StrAttr:$new_target); + + // The results are empty as the transformation does not produce any new payload. + let results = (outs); + + // Provide nice syntax. + let assemblyFormat = "$call `,` $new_target attr-dict `:` type($call)"; +} + +#endif // MY_EXTENSION diff --git a/mlir/examples/transform/Ch2/lib/CMakeLists.txt b/mlir/examples/transform/Ch2/lib/CMakeLists.txt new file mode 100644 index 0000000000000..11f0557457380 --- /dev/null +++ b/mlir/examples/transform/Ch2/lib/CMakeLists.txt @@ -0,0 +1,21 @@ +add_mlir_library( + # Library called MyExtension. + MyExtensionCh2 + + # Built from the following source files. + MyExtension.cpp + + # Make includes visible without top-level path. + ADDITIONAL_HEADER_DIRS + ${PROJECT_SOURCE_DIR}/examples/transform/Ch2/include + + # Make sure ODS declaration and definitions are generated before compiling this. + DEPENDS + MyExtensionCh2IncGen + + # Link in the transform dialect, an all generated dialects. + LINK_LIBS PUBLIC + MLIRTransformDialect + MLIRFuncDialect + MLIRSCFDialect +) diff --git a/mlir/examples/transform/Ch2/lib/MyExtension.cpp b/mlir/examples/transform/Ch2/lib/MyExtension.cpp new file mode 100644 index 0000000000000..5eda317db0a4e --- /dev/null +++ b/mlir/examples/transform/Ch2/lib/MyExtension.cpp @@ -0,0 +1,132 @@ +//===-- MyExtension.cpp - Transform dialect tutorial ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines Transform dialect extension operations used in the +// Chapter 2 of the Transform dialect tutorial. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "MyExtension.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+
+// Define a new transform dialect extension. This uses the CRTP idiom to
+// identify extensions.
+class MyExtension
+    : public ::mlir::transform::TransformDialectExtension<MyExtension> {
+public:
+  // The extension must derive the base constructor.
+  using Base::Base;
+
+  // This function initializes the extension, similarly to `initialize` in
+  // dialect definitions. List individual operations and dependent dialects
+  // here.
+  void init();
+};
+
+void MyExtension::init() {
+  // Similarly to dialects, an extension can declare a dependent dialect. This
+  // dialect will be loaded along with the extension and, therefore, along with
+  // the Transform dialect. Only declare as dependent the dialects that contain
+  // the attributes or types used by transform operations. Do NOT declare as
+  // dependent the dialects produced during the transformation.
+  // declareDependentDialect<MyDialect>();
+
+  // When transformations are applied, they may produce new operations from
+  // previously unloaded dialects. Typically, a pass would need to declare
+  // itself dependent on the dialects containing such new operations. To avoid
+  // confusion with the dialects the extension itself depends on, the Transform
+  // dialect differentiates between:
+  // - dependent dialects, which are used by the transform operations, and
+  // - generated dialects, which contain the entities (attributes, operations,
+  //   types) that may be produced by applying the transformation even when
+  //   not present in the original payload IR.
+  // In the following chapter, we will add operations that generate function
+  // calls and structured control flow operations, so let's declare the
+  // corresponding dialects as generated.
+  declareGeneratedDialect<::mlir::scf::SCFDialect>();
+  declareGeneratedDialect<::mlir::func::FuncDialect>();
+
+  // Finally, we register the additional transform operations with the dialect.
+  // List all operations generated from ODS. This call will perform additional
+  // checks that the operations implement the transform and memory effect
+  // interfaces required by the dialect interpreter and assert if they do not.
+  registerTransformOps<
+#define GET_OP_LIST
+#include "MyExtension.cpp.inc"
+      >();
+}
+
+#define GET_OP_CLASSES
+#include "MyExtension.cpp.inc"
+
+static void updateCallee(mlir::func::CallOp call, llvm::StringRef newTarget) {
+  call.setCallee(newTarget);
+}
+
+// Implementation of our transform dialect operation.
+// This operation returns a tri-state result that can be one of:
+// - success when the transformation succeeded;
+// - definite failure when the transformation failed in such a way that
+//   following transformations are impossible or undesirable, typically it
+//   could have left payload IR in an invalid state; it is expected that a
+//   diagnostic is emitted immediately before returning the definite error;
+// - silenceable failure when the transformation failed but following
+//   transformations are still applicable, typically this means a precondition
+//   for the transformation is not satisfied and the payload IR has not been
+//   modified. The silenceable failure additionally carries a Diagnostic that
+//   can be emitted to the user.
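+// For reference, transform IR exercising this operation looks roughly as
+// follows (adapted from the test in mlir/test/Examples/transform/Ch2/ops.mlir;
+// the matched payload and the "updated" callee name are illustrative):
+//
+//   transform.sequence failures(propagate) {
+//   ^bb0(%arg0: !transform.any_op):
+//     %call = transform.structured.match ops{["func.call"]} in %arg0
+//         : (!transform.any_op) -> !transform.any_op
+//     transform.my.change_call_target %call, "updated" : !transform.any_op
+//     transform.yield
+//   }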
+::mlir::DiagnosedSilenceableFailure mlir::transform::ChangeCallTargetOp::apply(
+    // The list of payload IR entities that will be associated with the
+    // transform IR values defined by this transform operation. In this case,
+    // it can remain empty as there are no results.
+    ::mlir::transform::TransformResults &results,
+    // The transform application state. This object can be used to query the
+    // current associations between transform IR values and payload IR entities.
+    // It can also carry additional user-defined state.
+    ::mlir::transform::TransformState &state) {
+
+  // First, we need to obtain the list of payload operations that are associated
+  // with the operand handle.
+  auto payload = state.getPayloadOps(getCall());
+
+  // Then, we iterate over the list of operands and call the actual IR-mutating
+  // function. We also check the preconditions here.
+  for (Operation *payloadOp : payload) {
+    auto call = dyn_cast<::mlir::func::CallOp>(payloadOp);
+    if (!call) {
+      DiagnosedSilenceableFailure diag =
+          emitSilenceableError() << "only applies to func.call payloads";
+      diag.attachNote(payloadOp->getLoc()) << "offending payload";
+      return diag;
+    }
+
+    updateCallee(call, getNewTarget());
+  }
+
+  // If everything went well, return success.
+  return DiagnosedSilenceableFailure::success();
+}
+
+void mlir::transform::ChangeCallTargetOp::getEffects(
+    ::llvm::SmallVectorImpl<::mlir::MemoryEffects::EffectInstance> &effects) {
+  // Indicate that the `call` handle is only read by this operation because the
+  // associated operation is not erased but rather modified in-place, so the
+  // reference to it remains valid.
+  onlyReadsHandle(getCall(), effects);
+
+  // Indicate that the payload is modified by this operation.
+  modifiesPayload(effects);
+}
+
+void registerMyExtension(::mlir::DialectRegistry &registry) {
+  registry.addExtensions<MyExtension>();
+}
diff --git a/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp
new file mode 100644
index 0000000000000..a875f630ef1b0
--- /dev/null
+++ b/mlir/examples/transform/Ch2/transform-opt/transform-opt.cpp
@@ -0,0 +1,61 @@
+//===-- transform-opt.cpp - Transform dialect tutorial entry point --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top-level file for the Transform dialect tutorial chapter 2.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MyExtension.h"
+
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
+#include "mlir/Transforms/Passes.h"
+#include <cstdlib>
+
+// Forward declarations of test passes that are used in this chapter for
+// illustrative purposes. Test passes are not directly exposed for use in
+// binaries other than mlir-opt, which is too big to serve as an example.
+namespace mlir::test {
+void registerTestTransformDialectEraseSchedulePass();
+void registerTestTransformDialectInterpreterPass();
+} // namespace mlir::test
+
+namespace test {
+void registerTestTransformDialectExtension(mlir::DialectRegistry &);
+} // namespace test
+
+int main(int argc, char **argv) {
+  // Register all "core" dialects and our transform dialect extension.
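+  // (Without registering the extension, the transform.my.* operations it
+  // provides could not be parsed or interpreted by the interpreter pass.)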
+ mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + registerMyExtension(registry); + + // Register a handful of cleanup passes that we can run to make the output IR + // look nicer. + mlir::registerCanonicalizerPass(); + mlir::registerCSEPass(); + mlir::registerSymbolDCEPass(); + + // Register the test passes. +#ifdef MLIR_INCLUDE_TESTS + mlir::test::registerTestTransformDialectEraseSchedulePass(); + mlir::test::registerTestTransformDialectInterpreterPass(); + test::registerTestTransformDialectExtension(registry); +#else + llvm::errs() << "warning: MLIR built without test passes, interpreter " + "testing will not be available\n"; +#endif // MLIR_INCLUDE_TESTS + + // Delegate to the MLIR utility for parsing and pass management. + return mlir::MlirOptMain(argc, argv, "transform-opt-ch2", registry) + .succeeded() + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/mlir/examples/transform/Ch3/CMakeLists.txt b/mlir/examples/transform/Ch3/CMakeLists.txt new file mode 100644 index 0000000000000..1bfe18b9c0373 --- /dev/null +++ b/mlir/examples/transform/Ch3/CMakeLists.txt @@ -0,0 +1,20 @@ +# For a better top-level template to copy, see examples/standalone. + +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + +add_subdirectory(include) +add_subdirectory(lib) + +add_dependencies(TransformExample transform-opt-ch3) +add_llvm_example(transform-opt-ch3 + transform-opt/transform-opt.cpp) + +target_link_libraries(transform-opt-ch3 + PRIVATE + MLIRIR + MLIRMlirOptMain + MLIRSideEffectInterfaces + MyExtensionCh3 +) diff --git a/mlir/examples/transform/Ch3/include/CMakeLists.txt b/mlir/examples/transform/Ch3/include/CMakeLists.txt new file mode 100644 index 0000000000000..32ec779ec3e1f --- /dev/null +++ b/mlir/examples/transform/Ch3/include/CMakeLists.txt @@ -0,0 +1,19 @@ +# Tell Tablegen to use MyExtension.td as input. +set(LLVM_TARGET_DEFINITIONS MyExtension.td) + +# Ask Tablegen to generate op declarations and definitions from ODS. +mlir_tablegen(MyExtension.h.inc -gen-op-decls) +mlir_tablegen(MyExtension.cpp.inc -gen-op-defs) + +# Tell Tablegen to use MyExtensionTypes.td as input. +set(LLVM_TARGET_DEFINITIONS MyExtensionTypes.td) + +# Ask Tablegen to generate type declarations and definitions from ODS. +mlir_tablegen(MyExtensionTypes.h.inc -gen-typedef-decls) +mlir_tablegen(MyExtensionTypes.cpp.inc -gen-typedef-defs) + +# Add a CMakeTarget we can depend on to ensure the generation happens before the compilation. +add_public_tablegen_target(MyExtensionCh3IncGen) + +# Don't forget to generate the documentation, this will produce a MyExtension.md under Dialects. +add_mlir_doc(MyExtension MyExtensionCh3 Dialects/ -gen-op-doc) diff --git a/mlir/examples/transform/Ch3/include/MyExtension.h b/mlir/examples/transform/Ch3/include/MyExtension.h new file mode 100644 index 0000000000000..223638eee1c03 --- /dev/null +++ b/mlir/examples/transform/Ch3/include/MyExtension.h @@ -0,0 +1,32 @@ +//===-- MyExtension.h - Transform dialect tutorial --------------*- c++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines Transform dialect extension operations used in the +// Chapter 3 of the Transform dialect tutorial. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" + +namespace mlir { +class CallOpInterface; +namespace func { +class CallOp; +} // namespace func +} // namespace mlir + +#define GET_TYPEDEF_CLASSES +#include "MyExtensionTypes.h.inc" + +#define GET_OP_CLASSES +#include "MyExtension.h.inc" + +// Registers our Transform dialect extension. +void registerMyExtension(::mlir::DialectRegistry ®istry); diff --git a/mlir/examples/transform/Ch3/include/MyExtension.td b/mlir/examples/transform/Ch3/include/MyExtension.td new file mode 100644 index 0000000000000..3c5695739fa2d --- /dev/null +++ b/mlir/examples/transform/Ch3/include/MyExtension.td @@ -0,0 +1,98 @@ +//===-- MyExtension.td - Transform dialect tutorial --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines Transform dialect extension operations used in the +// Chapter 3 of the Transform dialect tutorial. +// +//===----------------------------------------------------------------------===// + +#ifndef MY_EXTENSION +#define MY_EXTENSION + +include "MyExtensionTypes.td" +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/IR/TransformInterfaces.td" +include "mlir/Dialect/Transform/IR/TransformTypes.td" +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +// Define the new operation. By convention, prefix its name with the name of the dialect +// extension, "my.". The full operation name will be further prefixed with "transform.". +def ChangeCallTargetOp : Op]> { + // Provide a brief and a full description. It is recommended that the latter describes + // the effects on the operands and how the operation processes various failure modes. + let summary = "Changes the callee of a call operation to the specified one"; + let description = [{ + For each `func.call` payload operation associated with the handle, changes its + callee to be the symbol whose name is provided as an attribute to this operation. + + Generates a silenceable failure if the operand is associated with payload operations + that are not `func.call`. + Only reads the operand. + }]; + + // The arguments include the handle to the payload operations and the attribute that + // specifies the new callee. The handle must implement TransformHandleTypeInterface. + // We use a string attribute as the symbol may not exist in the transform IR so the + // verification may fail. + let arguments = (ins + // Specify the type constraint on the input accepting only `func.call` payload + // operations. + Transform_ConcreteOpType<"func.call">:$call, + StrAttr:$new_target); + + // The results are empty as the transformation does not produce any new payload. + let results = (outs); + + // Provide nice syntax. + let assemblyFormat = "$call `,` $new_target attr-dict `:` qualified(type($call))"; + + // Declare the function implementing the interface for a single payload operation. 
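+  // (This is the hook used by TransformEachOpTrait: the trait implements
+  // TransformOpInterface::apply() by calling applyToOne once for each payload
+  // operation associated with the operand handle and aggregating the results.)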
+ let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::func::CallOp call, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + +// Define another transform operation. +def CallToOp : Op { + // Summary and description omitted for brevity. + + // The argument is the handle to the payload operations. + let arguments = (ins CallOpInterfaceHandle:$call); + + // The result is the handle to the payload operations produced during the + // transformation. + let results = (outs TransformHandleTypeInterface:$transformed); + + // Provide nice syntax. + let assemblyFormat = "$call attr-dict `:` functional-type(operands, results)"; + + // Declare the function implementing the interface for a single payload operation. + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::CallOpInterface call, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + +#endif // MY_EXTENSION diff --git a/mlir/examples/transform/Ch3/include/MyExtensionTypes.td b/mlir/examples/transform/Ch3/include/MyExtensionTypes.td new file mode 100644 index 0000000000000..7d745935d4783 --- /dev/null +++ b/mlir/examples/transform/Ch3/include/MyExtensionTypes.td @@ -0,0 +1,34 @@ +//===-- MyExtensionTypes.td - Transform dialect tutorial ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines Transform dialect extension types used in the +// Chapter 3 of the Transform dialect tutorial. +// +//===----------------------------------------------------------------------===// + +#ifndef MY_EXTENSIONTYPES +#define MY_EXTENSIONTYPES + +include "mlir/IR/AttrTypeBase.td" +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/IR/TransformInterfaces.td" + +// Transform dialect allows additional types to be defined and injected. +def CallOpInterfaceHandle + : TypeDef]> { + + // The usual components of a type such as description, mnemonic and assembly format + // should be provided. + let summary = "handle to payload operations implementing CallOpInterface"; + let mnemonic = "my.call_op_interface"; + let assemblyFormat = ""; +} + +#endif // MY_EXTENSIONTYPES diff --git a/mlir/examples/transform/Ch3/lib/CMakeLists.txt b/mlir/examples/transform/Ch3/lib/CMakeLists.txt new file mode 100644 index 0000000000000..a879c87dd39a7 --- /dev/null +++ b/mlir/examples/transform/Ch3/lib/CMakeLists.txt @@ -0,0 +1,21 @@ +add_mlir_library( + # Library called MyExtension. + MyExtensionCh3 + + # Built from the following source files. + MyExtension.cpp + + # Make includes visible without top-level path. + ADDITIONAL_HEADER_DIRS + ${PROJECT_SOURCE_DIR}/examples/transform/Ch3/include + + # Make sure ODS declaration and definitions are generated before compiling this. + DEPENDS + MyExtensionCh3IncGen + + # Link in the transform dialect, an all generated dialects. 
+ LINK_LIBS PUBLIC + MLIRTransformDialect + MLIRFuncDialect + MLIRSCFDialect +) diff --git a/mlir/examples/transform/Ch3/lib/MyExtension.cpp b/mlir/examples/transform/Ch3/lib/MyExtension.cpp new file mode 100644 index 0000000000000..41b3ffef1b498 --- /dev/null +++ b/mlir/examples/transform/Ch3/lib/MyExtension.cpp @@ -0,0 +1,218 @@ +//===-- MyExtension.cpp - Transform dialect tutorial ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines Transform dialect extension operations used in the +// Chapter 3 of the Transform dialect tutorial. +// +//===----------------------------------------------------------------------===// + +#include "MyExtension.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/IR/DialectImplementation.h" +#include "mlir/Interfaces/CallInterfaces.h" +#include "llvm/ADT/TypeSwitch.h" + +#define GET_TYPEDEF_CLASSES +#include "MyExtensionTypes.cpp.inc" + +#define GET_OP_CLASSES +#include "MyExtension.cpp.inc" + +//===---------------------------------------------------------------------===// +// MyExtension +//===---------------------------------------------------------------------===// + +// Define a new transform dialect extension. This uses the CRTP idiom to +// identify extensions. +class MyExtension + : public ::mlir::transform::TransformDialectExtension { +public: + // The extension must derive the base constructor. + using Base::Base; + + // This function initializes the extension, similarly to `initialize` in + // dialect definitions. List individual operations and dependent dialects + // here. + void init(); +}; + +void MyExtension::init() { + // Similarly to dialects, an extension can declare a dependent dialect. This + // dialect will be loaded along with the extension and, therefore, along with + // the Transform dialect. Only declare as dependent the dialects that contain + // the attributes or types used by transform operations. Do NOT declare as + // dependent the dialects produced during the transformation. + // declareDependentDialect(); + + // When transformations are applied, they may produce new operations from + // previously unloaded dialects. Typically, a pass would need to declare + // itself dependent on the dialects containing such new operations. To avoid + // confusion with the dialects the extension itself depends on, the Transform + // dialects differentiates between: + // - dependent dialects, which are used by the transform operations, and + // - generated dialects, which contain the entities (attributes, operations, + // types) that may be produced by applying the transformation even when + // not present in the original payload IR. + // In the following chapter, we will be add operations that generate function + // calls and structured control flow operations, so let's declare the + // corresponding dialects as generated. + declareGeneratedDialect<::mlir::scf::SCFDialect>(); + declareGeneratedDialect<::mlir::func::FuncDialect>(); + + // Register the additional transform dialect types with the dialect. List all + // types generated from ODS. 
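+  // Once registered, the type becomes usable in transform IR under the
+  // dialect namespace, e.g. as !transform.my.call_op_interface (see the Ch3
+  // tests for concrete uses).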
+ registerTypes< +#define GET_TYPEDEF_LIST +#include "MyExtensionTypes.cpp.inc" + >(); + + // ODS generates these helpers for type printing and parsing, but the + // Transform dialect provides its own support for types supplied by the + // extension. Reference these functions to avoid a compiler warning. + (void)generatedTypeParser; + (void)generatedTypePrinter; + + // Finally, we register the additional transform operations with the dialect. + // List all operations generated from ODS. This call will perform additional + // checks that the operations implement the transform and memory effect + // interfaces required by the dialect interpreter and assert if they do not. + registerTransformOps< +#define GET_OP_LIST +#include "MyExtension.cpp.inc" + >(); +} + +//===---------------------------------------------------------------------===// +// ChangeCallTargetOp +//===---------------------------------------------------------------------===// + +static void updateCallee(mlir::func::CallOp call, llvm::StringRef newTarget) { + call.setCallee(newTarget); +} + +// Implementation of our transform dialect operation. +// This operation returns a tri-state result that can be one of: +// - success when the transformation succeeded; +// - definite failure when the transformation failed in such a way that +// following +// transformations are impossible or undesirable, typically it could have left +// payload IR in an invalid state; it is expected that a diagnostic is emitted +// immediately before returning the definite error; +// - silenceable failure when the transformation failed but following +// transformations +// are still applicable, typically this means a precondition for the +// transformation is not satisfied and the payload IR has not been modified. +// The silenceable failure additionally carries a Diagnostic that can be emitted +// to the user. +::mlir::DiagnosedSilenceableFailure +mlir::transform::ChangeCallTargetOp::applyToOne( + // The single payload operation to which the transformation is applied. + ::mlir::func::CallOp call, + // The payload IR entities that will be appended to lists associated with + // the results of this transform operation. This list contains one entry per + // result. + ::mlir::transform::ApplyToEachResultList &results, + // The transform application state. This object can be used to query the + // current associations between transform IR values and payload IR entities. + // It can also carry additional user-defined state. + ::mlir::transform::TransformState &state) { + + // Dispatch to the actual transformation. + updateCallee(call, getNewTarget()); + + // If everything went well, return success. + return DiagnosedSilenceableFailure::success(); +} + +void mlir::transform::ChangeCallTargetOp::getEffects( + ::llvm::SmallVectorImpl<::mlir::MemoryEffects::EffectInstance> &effects) { + // Indicate that the `call` handle is only read by this operation because the + // associated operation is not erased but rather modified in-place, so the + // reference to it remains valid. + onlyReadsHandle(getCall(), effects); + + // Indicate that the payload is modified by this operation. + modifiesPayload(effects); +} + +//===---------------------------------------------------------------------===// +// CallToOp +//===---------------------------------------------------------------------===// + +static mlir::Operation *replaceCallWithOp(mlir::CallOpInterface call) { + // Construct an operation from an unregistered dialect. 
This is discouraged + // and is only used here for brevity of the overall example. + mlir::OperationState state(call.getLoc(), "my.mm4"); + state.types.assign(call->result_type_begin(), call->result_type_end()); + state.operands.assign(call->operand_begin(), call->operand_end()); + + mlir::OpBuilder builder(call); + mlir::Operation *replacement = builder.create(state); + call->replaceAllUsesWith(replacement->getResults()); + call->erase(); + return replacement; +} + +// See above for the signature description. +mlir::DiagnosedSilenceableFailure mlir::transform::CallToOp::applyToOne( + mlir::CallOpInterface call, mlir::transform::ApplyToEachResultList &results, + mlir::transform::TransformState &state) { + + // Dispatch to the actual transformation. + Operation *replacement = replaceCallWithOp(call); + + // Associate the payload operation produced by the rewrite with the result + // handle of this transform operation. + results.push_back(replacement); + + // If everything went well, return success. + return DiagnosedSilenceableFailure::success(); +} + +//===---------------------------------------------------------------------===// +// CallOpInterfaceHandleType +//===---------------------------------------------------------------------===// + +// The interface declares this method to verify constraints this type has on +// payload operations. It returns the now familiar tri-state result. +mlir::DiagnosedSilenceableFailure +mlir::transform::CallOpInterfaceHandleType::checkPayload( + // Location at which diagnostics should be emitted. + mlir::Location loc, + // List of payload operations that are about to be associated with the + // handle that has this type. + llvm::ArrayRef payload) const { + + // All payload operations are expected to implement CallOpInterface, check + // this. + for (Operation *op : payload) { + if (llvm::isa(op)) + continue; + + // By convention, these verifiers always emit a silenceable failure since + // they are checking a precondition. + DiagnosedSilenceableFailure diag = + emitSilenceableError(loc) + << "expected the payload operation to implement CallOpInterface"; + diag.attachNote(op->getLoc()) << "offending operation"; + return diag; + } + + // If everything is okay, return success. + return DiagnosedSilenceableFailure::success(); +} + +//===---------------------------------------------------------------------===// +// Extension registration +//===---------------------------------------------------------------------===// + +void registerMyExtension(::mlir::DialectRegistry ®istry) { + registry.addExtensions(); +} diff --git a/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp b/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp new file mode 100644 index 0000000000000..d0da0c803b77f --- /dev/null +++ b/mlir/examples/transform/Ch3/transform-opt/transform-opt.cpp @@ -0,0 +1,61 @@ +//===-- transform-opt.cpp - Transform dialect tutorial entry point --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is the top-level file for the Transform dialect tutorial chapter 2. 
+// +//===----------------------------------------------------------------------===// + +#include "MyExtension.h" + +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/InitAllDialects.h" +#include "mlir/Tools/mlir-opt/MlirOptMain.h" +#include "mlir/Transforms/Passes.h" +#include + +// Forward declarations of test passes that used in this chapter for +// illustrative purposes. Test passes are not directly exposed for use in +// binaries other than mlir-opt, which is too big to serve as an example. +namespace mlir::test { +void registerTestTransformDialectEraseSchedulePass(); +void registerTestTransformDialectInterpreterPass(); +} // namespace mlir::test + +namespace test { +void registerTestTransformDialectExtension(mlir::DialectRegistry &); +} // namespace test + +int main(int argc, char **argv) { + // Register all "core" dialects and our transform dialect extension. + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + registerMyExtension(registry); + + // Register a handful of cleanup passes that we can run to make the output IR + // look nicer. + mlir::registerCanonicalizerPass(); + mlir::registerCSEPass(); + mlir::registerSymbolDCEPass(); + + // Register the test passes. +#ifdef MLIR_INCLUDE_TESTS + mlir::test::registerTestTransformDialectEraseSchedulePass(); + mlir::test::registerTestTransformDialectInterpreterPass(); + test::registerTestTransformDialectExtension(registry); +#else + llvm::errs() << "warning: MLIR built without test passes, interpreter " + "testing will not be available\n"; +#endif // MLIR_INCLUDE_TESTS + + // Delegate to the MLIR utility for parsing and pass management. + return mlir::MlirOptMain(argc, argv, "transform-opt-ch3", registry) + .succeeded() + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/mlir/examples/transform/README.md b/mlir/examples/transform/README.md new file mode 100644 index 0000000000000..d528098195223 --- /dev/null +++ b/mlir/examples/transform/README.md @@ -0,0 +1,4 @@ +Transform Dialect Tutorial is available at +https://mlir.llvm.org/docs/Tutorials/Transform. + +Test files are located under `mlir/test/Examples/Transform`. diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 048291bd5b8d8..dd8208b674519 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -125,6 +125,8 @@ if(LLVM_BUILD_EXAMPLES) toyc-ch3 toyc-ch4 toyc-ch5 + transform-opt-ch2 + transform-opt-ch3 ) if(MLIR_ENABLE_EXECUTION_ENGINE) list(APPEND MLIR_TEST_DEPENDS diff --git a/mlir/test/Examples/transform/Ch1/invalidation-1.mlir b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir new file mode 100644 index 0000000000000..4270e1eaa44c1 --- /dev/null +++ b/mlir/test/Examples/transform/Ch1/invalidation-1.mlir @@ -0,0 +1,98 @@ +// RUN: mlir-opt %s \ +// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ +// RUN: bind-first-extra-to-ops=linalg.matmul \ +// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ +// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" \ +// RUN: --split-input-file --verify-diagnostics + +// ****************************** IMPORTANT NOTE ****************************** +// +// If you are changing this file, you may also need to change +// mlir/docs/Tutorials/Transform accordingly. 
+// +// **************************************************************************** + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + // expected-note @below {{handle to invalidated ops}} + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // The actual tiling transformation takes tile sizes as attributes. + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + %loop, %tiled = transform.structured.tile_to_forall_op %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) + + // This is trying to use an invalidated handle leading to undefined behavior. + // expected-error @below {{uses a handle invalidated by a previously executed transform op}} + transform.test_print_remark_at_operand %arg1, "remark" : !transform.op<"linalg.matmul"> + transform.yield +} + +// Original function to optimize. +func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, + %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>) + -> tensor<512x512xf32> { + // Matrix-matrix multiplication. + // expected-note @below {{payload op}} + %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise addition. + %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise max with 0 (ReLU). + %c0f = arith.constant 0.0 : f32 + %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%biased, %c0f : tensor<512x512xf32>, f32) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + func.return %relued : tensor<512x512xf32> +} + +// ----- + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // We can cast one type to another as long as operations are compatible + // with both types. This creates "aliasing" handles. + // expected-note @below {{handle to invalidated ops}} + %casted = transform.cast %arg1 : !transform.op<"linalg.matmul"> to + !transform.any_op + + // The actual tiling transformation takes tile sizes as attributes. + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + %loop, %tiled = transform.structured.tile_to_forall_op %arg1 tile_sizes [4, 32] + : (!transform.op<"linalg.matmul">) -> (!transform.any_op, !transform.any_op) + + // Consuming an operand invalidates the consumed handle and any other handle that is + // associated with the same payload operations, or payload operations nested in them. + // expected-error @below {{uses a handle invalidated by a previously executed transform op}} + transform.test_print_remark_at_operand %casted, "remark" + : !transform.any_op + transform.yield +} + +// Original function to optimize. +func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, + %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>) + -> tensor<512x512xf32> { + // Matrix-matrix multiplication. 
+ // expected-note @below {{payload op}} + %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise addition. + %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise max with 0 (ReLU). + %c0f = arith.constant 0.0 : f32 + %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%biased, %c0f : tensor<512x512xf32>, f32) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + func.return %relued : tensor<512x512xf32> +} diff --git a/mlir/test/Examples/transform/Ch1/invalidation-2.mlir b/mlir/test/Examples/transform/Ch1/invalidation-2.mlir new file mode 100644 index 0000000000000..dafdb3aca57f8 --- /dev/null +++ b/mlir/test/Examples/transform/Ch1/invalidation-2.mlir @@ -0,0 +1,102 @@ +// RUN: mlir-opt %s \ +// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ +// RUN: bind-first-extra-to-ops=linalg.matmul \ +// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ +// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" \ +// RUN: --split-input-file --verify-diagnostics + +// ****************************** IMPORTANT NOTE ****************************** +// +// If you are changing this file, you may also need to change +// mlir/docs/Tutorials/Transform accordingly. +// +// **************************************************************************** + +// Original function to optimize. +func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, + %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>) + -> tensor<512x512xf32> { + // Matrix-matrix multiplication. + + // expected-note @below {{nested payload op}} + %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise addition. + + // expected-note @below {{ancestor payload op}} + %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise max with 0 (ReLU). + %c0f = arith.constant 0.0 : f32 + %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%biased, %c0f : tensor<512x512xf32>, f32) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + func.return %relued : tensor<512x512xf32> +} + +// Declaration of the "microkernel" function that we will be targeting. +func.func private @microkernel( + %lhs: tensor<4x512xf32>, + %rhs: tensor<512x4xf32>, + %bias: tensor<4x4xf32>, + %init: tensor<4x4xf32>, + %output: tensor<4x4xf32>) -> tensor<4x4xf32> + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. 
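+  // (For the 512x512 operands in this example, tile sizes [8, 32] yield an
+  // scf.forall over a 64x16 grid of tiles; %loop below is the handle to that
+  // loop and %tiled to the tiled operation inside it.)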
+ %loop, %tiled = transform.structured.tile_to_forall_op %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. + %loop_second, %tiled_second = transform.structured.tile_to_forall_op %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %loop_third, %_0 = transform.structured.tile_to_forall_op %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + // expected-note @below {{handle to invalidated ops}} + %f, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // expected-note @below {{invalidated by this transform op that consumes its operand #0 and invalidates all handles to payload IR entities associated with this operand and entities nested in them}} + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + // expected-error @below {{uses a handle invalidated by a previously executed transform op}} + transform.test_print_remark_at_operand %f, "fused" : !transform.any_op + + transform.yield +} diff --git a/mlir/test/Examples/transform/Ch1/sequence.mlir b/mlir/test/Examples/transform/Ch1/sequence.mlir new file mode 100644 index 0000000000000..ff2fc2a70d5c9 --- /dev/null +++ b/mlir/test/Examples/transform/Ch1/sequence.mlir @@ -0,0 +1,111 @@ +// RUN: mlir-opt %s \ +// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ +// RUN: bind-first-extra-to-ops=linalg.matmul \ +// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ +// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\ +// RUN: FileCheck %s + +// ****************************** IMPORTANT NOTE ****************************** +// +// If you are changing this file, you may also need to change +// mlir/docs/Tutorials/Transform accordingly. 
+// +// **************************************************************************** + +// Original function to optimize. +func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, + %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>) + -> tensor<512x512xf32> { + // Matrix-matrix multiplication. + %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise addition. + %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise max with 0 (ReLU). + %c0f = arith.constant 0.0 : f32 + %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%biased, %c0f : tensor<512x512xf32>, f32) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + func.return %relued : tensor<512x512xf32> +} + +// CHECK: func @outlined +// CHECK: linalg.matmul +// CHECK: linalg.elemwise_binary {fun = #linalg.binary_fn} + +// CHECK-LABEL: func @fc_relu +// CHECK: scf.forall +// CHECK: scf.forall +// CHECK: %[[SLICE4:.+]] = tensor.extract_slice +// CHECK: %[[SLICE5:.+]] = tensor.extract_slice +// CHECK: %[[SLICE6:.+]] = tensor.extract_slice +// CHECK: %[[SLICE7:.+]] = tensor.extract_slice +// CHECK: %[[SLICE8:.+]] = tensor.extract_slice +// CHECK: func.call @outlined(%[[SLICE4]], %[[SLICE5]], %[[SLICE6]], %[[SLICE7]], %[[SLICE8]]) +// CHECK-NOT: linalg.matmul +// CHECK-NOT: linalg.elemwise_binary +// CHECK: scf.forall.in_parallel +// CHECK: linalg.elemwise_binary {fun = #linalg.binary_fn} +// CHECK: scf.forall.in_parallel + +// Declaration of the "microkernel" function that we will be targeting. +func.func private @microkernel( + %lhs: tensor<4x512xf32>, + %rhs: tensor<512x4xf32>, + %bias: tensor<4x4xf32>, + %init: tensor<4x4xf32>, + %output: tensor<4x4xf32>) -> tensor<4x4xf32> + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %loop, %tiled = transform.structured.tile_to_forall_op %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. 
Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. + %loop_second, %tiled_second = transform.structured.tile_to_forall_op %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %loop_third, %_0 = transform.structured.tile_to_forall_op %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + transform.yield +} diff --git a/mlir/test/Examples/transform/Ch2/invalid.mlir b/mlir/test/Examples/transform/Ch2/invalid.mlir new file mode 100644 index 0000000000000..ad536832d9c52 --- /dev/null +++ b/mlir/test/Examples/transform/Ch2/invalid.mlir @@ -0,0 +1,11 @@ +// RUN: transform-opt-ch2 %s --test-transform-dialect-interpreter --split-input-file --verify-diagnostics + +// expected-note @below {{offending payload}} +module { + transform.sequence failures(propagate) { + ^bb0(%arg0: !transform.any_op): + // expected-error @below {{only applies to func.call payloads}} + transform.my.change_call_target %arg0, "updated" : !transform.any_op + yield + } +} diff --git a/mlir/test/Examples/transform/Ch2/ops.mlir b/mlir/test/Examples/transform/Ch2/ops.mlir new file mode 100644 index 0000000000000..d66f89b9ec8dd --- /dev/null +++ b/mlir/test/Examples/transform/Ch2/ops.mlir @@ -0,0 +1,26 @@ +// RUN: transform-opt-ch2 %s --test-transform-dialect-interpreter | FileCheck %s + +// ****************************** IMPORTANT NOTE ****************************** +// +// If you are changing this file, you may also need to change +// mlir/docs/Tutorials/Transform accordingly. 
+// +// **************************************************************************** + +func.func private @orig() +func.func private @updated() + +// CHECK-LABEL: func @test +func.func @test() { + // CHECK: call @updated + call @orig() : () -> () + return +} + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op): + %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.any_op + transform.my.change_call_target %call, "updated" : !transform.any_op + transform.yield +} diff --git a/mlir/test/Examples/transform/Ch2/sequence.mlir b/mlir/test/Examples/transform/Ch2/sequence.mlir new file mode 100644 index 0000000000000..280231d733f3d --- /dev/null +++ b/mlir/test/Examples/transform/Ch2/sequence.mlir @@ -0,0 +1,110 @@ +// RUN: transform-opt-ch2 %s \ +// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ +// RUN: bind-first-extra-to-ops=linalg.matmul \ +// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ +// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\ +// RUN: FileCheck %s + +// ****************************** IMPORTANT NOTE ****************************** +// +// If you are changing this file, you may also need to change +// mlir/docs/Tutorials/Transform accordingly. +// +// **************************************************************************** + +// Original function to optimize. +func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, + %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>) + -> tensor<512x512xf32> { + // Matrix-matrix multiplication. + %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise addition. + %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise max with 0 (ReLU). + %c0f = arith.constant 0.0 : f32 + %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%biased, %c0f : tensor<512x512xf32>, f32) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + func.return %relued : tensor<512x512xf32> +} + +// CHECK-LABEL: func @fc_relu +// CHECK: scf.forall +// CHECK: scf.forall +// CHECK: %[[SLICE4:.+]] = tensor.extract_slice +// CHECK: %[[SLICE5:.+]] = tensor.extract_slice +// CHECK: %[[SLICE6:.+]] = tensor.extract_slice +// CHECK: %[[SLICE7:.+]] = tensor.extract_slice +// CHECK: %[[SLICE8:.+]] = tensor.extract_slice +// CHECK: func.call @microkernel(%[[SLICE4]], %[[SLICE5]], %[[SLICE6]], %[[SLICE7]], %[[SLICE8]]) +// CHECK-NOT: linalg.matmul +// CHECK-NOT: linalg.elemwise_binary +// CHECK: scf.forall.in_parallel +// CHECK: linalg.elemwise_binary {fun = #linalg.binary_fn} +// CHECK: scf.forall.in_parallel + +// Declaration of the "microkernel" function that we will be targeting. 
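+// (The operand shapes below correspond to the [4, 4] tiling performed by the
+// transform sequence: each outlined call multiplies a 4x512 slice of the LHS
+// by a 512x4 slice of the RHS and combines the result with a 4x4 bias tile.)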
+func.func private @microkernel( + %lhs: tensor<4x512xf32>, + %rhs: tensor<512x4xf32>, + %bias: tensor<4x4xf32>, + %init: tensor<4x4xf32>, + %output: tensor<4x4xf32>) -> tensor<4x4xf32> + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %loop, %tiled = transform.structured.tile_to_forall_op %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. + %loop_second, %tiled_second = transform.structured.tile_to_forall_op %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %loop_third, %_0 = transform.structured.tile_to_forall_op %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Rewrite the call target. 
+ transform.my.change_call_target %call, "microkernel" : !transform.any_op + + transform.yield +} diff --git a/mlir/test/Examples/transform/Ch3/invalid.mlir b/mlir/test/Examples/transform/Ch3/invalid.mlir new file mode 100644 index 0000000000000..222629504fea6 --- /dev/null +++ b/mlir/test/Examples/transform/Ch3/invalid.mlir @@ -0,0 +1,10 @@ +// RUN: transform-opt-ch3 %s --test-transform-dialect-interpreter --split-input-file --verify-diagnostics + +// expected-note @below {{offending operation}} +module { + transform.sequence failures(suppress) { + // expected-error @below {{expected the payload operation to implement CallOpInterface}} + ^bb0(%arg0: !transform.my.call_op_interface): + yield + } +} diff --git a/mlir/test/Examples/transform/Ch3/ops.mlir b/mlir/test/Examples/transform/Ch3/ops.mlir new file mode 100644 index 0000000000000..f4170b8918bfe --- /dev/null +++ b/mlir/test/Examples/transform/Ch3/ops.mlir @@ -0,0 +1,46 @@ +// RUN: transform-opt-ch3 %s --test-transform-dialect-interpreter \ +// RUN: --allow-unregistered-dialect --split-input-file | FileCheck %s + +// ****************************** IMPORTANT NOTE ****************************** +// +// If you are changing this file, you may also need to change +// mlir/docs/Tutorials/Transform accordingly. +// +// **************************************************************************** + +func.func private @orig() +func.func private @updated() + +// CHECK-LABEL: func @test1 +func.func @test1() { + // CHECK: call @updated + call @orig() : () -> () + return +} + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op): + %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.op<"func.call"> + // CHECK: transform.my.change_call_target %{{.*}}, "updated" : !transform.op<"func.call"> + transform.my.change_call_target %call, "updated" : !transform.op<"func.call"> + transform.yield +} + +// ----- + +func.func private @orig() + +// CHECK-LABEL: func @test2 +func.func @test2() { + // CHECK: "my.mm4" + call @orig() : () -> () + return +} + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op): + %call = transform.structured.match ops{["func.call"]} in %arg0 : (!transform.any_op) -> !transform.my.call_op_interface + // CHECK: transform.my.call_to_op %{{.*}} : (!transform.my.call_op_interface) -> !transform.any_op + transform.my.call_to_op %call : (!transform.my.call_op_interface) -> !transform.any_op + transform.yield +} diff --git a/mlir/test/Examples/transform/Ch3/sequence.mlir b/mlir/test/Examples/transform/Ch3/sequence.mlir new file mode 100644 index 0000000000000..94f396941bfc2 --- /dev/null +++ b/mlir/test/Examples/transform/Ch3/sequence.mlir @@ -0,0 +1,110 @@ +// RUN: transform-opt-ch2 %s \ +// RUN: --pass-pipeline="builtin.module(test-transform-dialect-interpreter{ \ +// RUN: bind-first-extra-to-ops=linalg.matmul \ +// RUN: bind-second-extra-to-ops=linalg.elemwise_binary \ +// RUN: enable-expensive-checks},canonicalize,cse,symbol-dce)" |\ +// RUN: FileCheck %s + +// ****************************** IMPORTANT NOTE ****************************** +// +// If you are changing this file, you may also need to change +// mlir/docs/Tutorials/Transform accordingly. +// +// **************************************************************************** + +// Original function to optimize. 
+func.func @fc_relu(%lhs: tensor<512x512xf32>, %rhs: tensor<512x512xf32>, + %bias: tensor<512x512xf32>, %output: tensor<512x512xf32>) + -> tensor<512x512xf32> { + // Matrix-matrix multiplication. + %matmul = linalg.matmul ins(%lhs, %rhs: tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output: tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise addition. + %biased = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%matmul, %bias : tensor<512x512xf32>, tensor<512x512xf32>) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + + // Elementwise max with 0 (ReLU). + %c0f = arith.constant 0.0 : f32 + %relued = linalg.elemwise_binary { fun = #linalg.binary_fn } + ins(%biased, %c0f : tensor<512x512xf32>, f32) + outs(%output : tensor<512x512xf32>) -> tensor<512x512xf32> + func.return %relued : tensor<512x512xf32> +} + +// CHECK-LABEL: func @fc_relu +// CHECK: scf.forall +// CHECK: scf.forall +// CHECK: %[[SLICE4:.+]] = tensor.extract_slice +// CHECK: %[[SLICE5:.+]] = tensor.extract_slice +// CHECK: %[[SLICE6:.+]] = tensor.extract_slice +// CHECK: %[[SLICE7:.+]] = tensor.extract_slice +// CHECK: %[[SLICE8:.+]] = tensor.extract_slice +// CHECK: func.call @microkernel(%[[SLICE4]], %[[SLICE5]], %[[SLICE6]], %[[SLICE7]], %[[SLICE8]]) +// CHECK-NOT: linalg.matmul +// CHECK-NOT: linalg.elemwise_binary +// CHECK: scf.forall.in_parallel +// CHECK: linalg.elemwise_binary {fun = #linalg.binary_fn} +// CHECK: scf.forall.in_parallel + +// Declaration of the "microkernel" function that we will be targeting. +func.func private @microkernel( + %lhs: tensor<4x512xf32>, + %rhs: tensor<512x4xf32>, + %bias: tensor<4x4xf32>, + %init: tensor<4x4xf32>, + %output: tensor<4x4xf32>) -> tensor<4x4xf32> + +transform.sequence failures(propagate) { +^bb0(%arg0: !transform.any_op, + %arg1: !transform.op<"linalg.matmul">, + %arg2: !transform.op<"linalg.elemwise_binary">): + // Since the %arg2 handle is associated with both elementwise operations, + // we need to split it into two handles so we can target only the second + // elementwise operation. + %add, %max = transform.split_handle %arg2 : (!transform.op<"linalg.elemwise_binary">) + -> (!transform.any_op, !transform.any_op) + + // The actual tiling transformation takes tile sizes as attributes. It produces a + // handle to the loop generated during tiling. + %loop, %tiled = transform.structured.tile_to_forall_op %max tile_sizes [8, 32] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + + // We can now fuse the other operations into the loop. Here, we fuse + // operations one-by-one. This requires the operation that is being fused + // to define the value used within the loop, so the order of such fusions + // is important. We could also use "transform.merge_handles" to obtain + // a single handle to all operations and give it to `fuse_into_containing_op` + // that would take care of the ordering in this case. + %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 into %loop2 + : (!transform.op<"linalg.matmul">, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Tile again to get the desired size. Note that this time this tiles the + // "add" operation and fuses matmul into the loop, but doesn't affect the + // "max" operation. This illustrates the precise targeting with the transform + // dialect. 
Otherwise, it is difficult to differentiate "add" and "max", both + // of which having the same kind. + %loop_second, %tiled_second = transform.structured.tile_to_forall_op %add_fused tile_sizes [4, 4] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused_2, %loop_second_2 = + transform.structured.fuse_into_containing_op %matmul_fused into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + + // Since outlining is currently only implemented for region-holding operations + // such as loops, use tiling to size 1 to materialize the outer loop that is + // going to be outlined. + %loop_third, %_0 = transform.structured.tile_to_forall_op %tiled_second tile_sizes [1] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} + : (!transform.any_op) -> (!transform.any_op, !transform.op<"func.call">) + + // Rewrite the call target. + transform.my.change_call_target %call, "microkernel" : !transform.op<"func.call"> + + transform.yield +} diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 8f18fc6015326..3a8bdbfcec280 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -128,6 +128,8 @@ def add_runtime(name): ToolSubst("toyc-ch5", unresolved="ignore"), ToolSubst("toyc-ch6", unresolved="ignore"), ToolSubst("toyc-ch7", unresolved="ignore"), + ToolSubst('transform-opt-ch2', unresolved='ignore'), + ToolSubst('transform-opt-ch3', unresolved='ignore'), ToolSubst("%mlir_lib_dir", config.mlir_lib_dir, unresolved="ignore"), ToolSubst("%mlir_src_dir", config.mlir_src_root, unresolved="ignore"), ] From 0e4c4c77730810db235d377d49ba5860dfa0bd8d Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Tue, 30 May 2023 08:33:13 -0700 Subject: [PATCH 091/704] [clang] Extend __is_trivially_equality_comparable to check for hidden friends This allows types to be considered trivially equality comparable if a defaulted hidden friend is used. 
Reviewed By: erichkeane Spies: cfe-commits Differential Revision: https://reviews.llvm.org/D151623 --- clang/lib/AST/Type.cpp | 21 +++- clang/test/SemaCXX/type-traits.cpp | 166 +++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+), 5 deletions(-) diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 508965fc38e55..bde88653417d9 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -18,6 +18,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/DeclFriend.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/DependenceFlags.h" @@ -2640,11 +2641,21 @@ HasNonDeletedDefaultedEqualityComparison(const CXXRecordDecl *Decl) { if (Decl->isUnion()) return false; - if (llvm::none_of(Decl->methods(), [](const CXXMethodDecl *MemberFunction) { - return MemberFunction->isOverloadedOperator() && - MemberFunction->getOverloadedOperator() == - OverloadedOperatorKind::OO_EqualEqual && - MemberFunction->isDefaulted(); + auto IsDefaultedOperatorEqualEqual = [&](const FunctionDecl *Function) { + return Function->getOverloadedOperator() == + OverloadedOperatorKind::OO_EqualEqual && + Function->isDefaulted() && Function->getNumParams() > 0 && + (Function->getParamDecl(0)->getType()->isReferenceType() || + Decl->isTriviallyCopyable()); + }; + + if (llvm::none_of(Decl->methods(), IsDefaultedOperatorEqualEqual) && + llvm::none_of(Decl->friends(), [&](const FriendDecl *Friend) { + if (NamedDecl *ND = Friend->getFriendDecl()) { + return ND->isFunctionOrFunctionTemplate() && + IsDefaultedOperatorEqualEqual(ND->getAsFunction()); + } + return false; })) return false; diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index 75f172d1c3452..d5388d4eb89be 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -3270,6 +3270,172 @@ struct NotTriviallyEqualityComparableHasEnum { }; static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableHasEnum)); +namespace hidden_friend { + +struct TriviallyEqualityComparable { + int i; + int j; + + void func(); + bool operator==(int) const { return false; } + + friend bool operator==(const TriviallyEqualityComparable&, const TriviallyEqualityComparable&) = default; +}; +static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparable), ""); + +struct TriviallyEqualityComparableNonTriviallyCopyable { + TriviallyEqualityComparableNonTriviallyCopyable(const TriviallyEqualityComparableNonTriviallyCopyable&); + ~TriviallyEqualityComparableNonTriviallyCopyable(); + friend bool operator==(const TriviallyEqualityComparableNonTriviallyCopyable&, const TriviallyEqualityComparableNonTriviallyCopyable&) = default; + int i; +}; +static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparableNonTriviallyCopyable)); + +struct NotTriviallyEqualityComparableHasPadding { + short i; + int j; + + friend bool operator==(const NotTriviallyEqualityComparableHasPadding&, const NotTriviallyEqualityComparableHasPadding&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableHasPadding), ""); + +struct NotTriviallyEqualityComparableHasFloat { + float i; + int j; + + friend bool operator==(const NotTriviallyEqualityComparableHasFloat&, const NotTriviallyEqualityComparableHasFloat&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableHasFloat), ""); + +struct 
NotTriviallyEqualityComparableHasTailPadding { + int i; + char j; + + friend bool operator==(const NotTriviallyEqualityComparableHasTailPadding&, const NotTriviallyEqualityComparableHasTailPadding&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableHasTailPadding), ""); + +struct NotTriviallyEqualityComparableBase : NotTriviallyEqualityComparableHasTailPadding { + char j; + + friend bool operator==(const NotTriviallyEqualityComparableBase&, const NotTriviallyEqualityComparableBase&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableBase), ""); + +class TriviallyEqualityComparablePaddedOutBase { + int i; + char c; + +public: + friend bool operator==(const TriviallyEqualityComparablePaddedOutBase&, const TriviallyEqualityComparablePaddedOutBase&) = default; +}; +static_assert(!__is_trivially_equality_comparable(TriviallyEqualityComparablePaddedOutBase), ""); + +struct TriviallyEqualityComparablePaddedOut : TriviallyEqualityComparablePaddedOutBase { + char j[3]; + + friend bool operator==(const TriviallyEqualityComparablePaddedOut&, const TriviallyEqualityComparablePaddedOut&) = default; +}; +static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparablePaddedOut), ""); + +struct TriviallyEqualityComparable1 { + char i; + + friend bool operator==(const TriviallyEqualityComparable1&, const TriviallyEqualityComparable1&) = default; +}; +static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparable1)); + +struct TriviallyEqualityComparable2 { + int i; + + friend bool operator==(const TriviallyEqualityComparable2&, const TriviallyEqualityComparable2&) = default; +}; +static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparable2)); + +struct NotTriviallyEqualityComparableTriviallyEqualityComparableBases + : TriviallyEqualityComparable1, TriviallyEqualityComparable2 { + friend bool operator==(const NotTriviallyEqualityComparableTriviallyEqualityComparableBases&, const NotTriviallyEqualityComparableTriviallyEqualityComparableBases&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableTriviallyEqualityComparableBases)); + +struct NotTriviallyEqualityComparableBitfield { + int i : 1; + + friend bool operator==(const NotTriviallyEqualityComparableBitfield&, const NotTriviallyEqualityComparableBitfield&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableBitfield)); + +// TODO: This is trivially equality comparable +struct NotTriviallyEqualityComparableBitfieldFilled { + char i : __CHAR_BIT__; + + friend bool operator==(const NotTriviallyEqualityComparableBitfieldFilled&, const NotTriviallyEqualityComparableBitfieldFilled&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableBitfield)); + +union U { + int i; + + friend bool operator==(const U&, const U&) = default; +}; + +struct NotTriviallyEqualityComparableImplicitlyDeletedOperatorByUnion { + U u; + + friend bool operator==(const NotTriviallyEqualityComparableImplicitlyDeletedOperatorByUnion&, const NotTriviallyEqualityComparableImplicitlyDeletedOperatorByUnion&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableImplicitlyDeletedOperatorByUnion)); + +struct NotTriviallyEqualityComparableExplicitlyDeleted { + int i; + + friend bool operator==(const NotTriviallyEqualityComparableExplicitlyDeleted&, const 
NotTriviallyEqualityComparableExplicitlyDeleted&) = delete; +}; + +struct NotTriviallyEqualityComparableImplicitlyDeletedOperatorByStruct { + NotTriviallyEqualityComparableExplicitlyDeleted u; + + friend bool operator==(const NotTriviallyEqualityComparableImplicitlyDeletedOperatorByStruct&, const NotTriviallyEqualityComparableImplicitlyDeletedOperatorByStruct&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableImplicitlyDeletedOperatorByStruct)); + +struct NotTriviallyEqualityComparableHasReferenceMember { + int& i; + + friend bool operator==(const NotTriviallyEqualityComparableHasReferenceMember&, const NotTriviallyEqualityComparableHasReferenceMember&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableHasReferenceMember)); + +enum E { + a, + b +}; +bool operator==(E, E) { return false; } +static_assert(!__is_trivially_equality_comparable(E)); + +struct NotTriviallyEqualityComparableHasEnum { + E e; + friend bool operator==(const NotTriviallyEqualityComparableHasEnum&, const NotTriviallyEqualityComparableHasEnum&) = default; +}; +static_assert(!__is_trivially_equality_comparable(NotTriviallyEqualityComparableHasEnum)); + +struct NonTriviallyEqualityComparableValueComparisonNonTriviallyCopyable { + int i; + NonTriviallyEqualityComparableValueComparisonNonTriviallyCopyable(const NonTriviallyEqualityComparableValueComparisonNonTriviallyCopyable&); + + friend bool operator==(NonTriviallyEqualityComparableValueComparisonNonTriviallyCopyable, NonTriviallyEqualityComparableValueComparisonNonTriviallyCopyable) = default; +}; +static_assert(!__is_trivially_equality_comparable(NonTriviallyEqualityComparableValueComparisonNonTriviallyCopyable)); + +struct TriviallyEqualityComparableRefComparisonNonTriviallyCopyable { + int i; + TriviallyEqualityComparableRefComparisonNonTriviallyCopyable(const TriviallyEqualityComparableRefComparisonNonTriviallyCopyable&); + + friend bool operator==(const TriviallyEqualityComparableRefComparisonNonTriviallyCopyable&, const TriviallyEqualityComparableRefComparisonNonTriviallyCopyable&) = default; +}; +static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparableRefComparisonNonTriviallyCopyable)); +} + #endif // __cplusplus >= 202002L }; From 8098f2577efa4e0319a8252d8f8d7c382aa71986 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 30 May 2023 16:42:55 +0100 Subject: [PATCH 092/704] [LV] Use Legal::isUniform to detect uniform pointers. Update collectLoopUniforms to identify uniform pointers using Legal::isUniform. This is more powerful and brings pointer classification here in sync with setCostBasedWideningDecision which uses isUniformMemOp. The existing mis-match in reasoning can causes crashes due to D134460, which is fixed by this patch. Fixes https://github.com/llvm/llvm-project/issues/60831. 
Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D150991 --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++---- .../LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll | 3 --- .../test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll | 8 +++----- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0a99ccb5df373..9c5caaddb4852 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4703,7 +4703,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { if (isa(I) && I->getOperand(0) == Ptr) return false; - return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF); + return getLoadStorePointerOperand(I) == Ptr && + (isUniformDecision(I, VF) || Legal->isUniform(Ptr)); }; // Holds a list of values which are known to have at least one uniform use. @@ -4749,10 +4750,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { if (isUniformMemOpUse(&I)) addToWorklistIfAllowed(&I); - if (isVectorizedMemAccessUse(&I, Ptr)) { - assert(isUniformDecision(&I, VF) && "consistency check"); + if (isVectorizedMemAccessUse(&I, Ptr)) HasUniformUse.insert(Ptr); - } } // Add to the worklist any operands which have *only* uniform (e.g. lane 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll index cff60b10d0187..9526dd11cd6da 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll @@ -3,9 +3,6 @@ target triple = "aarch64-unknown-linux-gnu" -; REQUIRES: asserts -; XFAIL: * - ; Test cases for PR60831. 
define void @test_invar_gep(ptr %dst) #0 { diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll index e5d00b35f11ed..db5a7105fd8c4 100644 --- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll @@ -50,13 +50,11 @@ define i16 @test(ptr %arg, i64 %N) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP6]], align 2, !alias.scope !0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[L_2]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[WIDE_LOAD]], i32 1 ; CHECK-NEXT: store i16 [[TMP8]], ptr [[TMP7]], align 2, !alias.scope !3, !noalias !0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[WIDE_LOAD]], i32 1 -; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP7]], align 2, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] From 5e7ac2503a1bbfa13b84f00d8e12865cd16b0164 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Tue, 30 May 2023 02:18:32 -0400 Subject: [PATCH 093/704] [mlir][transform] Add op for adding attributes to payload IR The ability to add attributes to payload IR is useful functionality independent of any dialect. This is added here through `transform.annotate` by enabling attributes tied to a `TransformParamTypeInterface` (which internally refers to an Attribute) to be added to a target operation by name. The AnnotateOp does not produce a new handle as no existing handles should be affected by adding an attribute. Existing attributes on the payload with the same name will be overwritten. Differential Revision: https://reviews.llvm.org/D151689 --- .../mlir/Dialect/Transform/IR/TransformOps.td | 25 +++++++++++++ .../lib/Dialect/Transform/IR/TransformOps.cpp | 37 +++++++++++++++++++ .../Dialect/Transform/test-interpreter.mlir | 34 +++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index 9305b6b0859e2..6036687017a55 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -101,6 +101,31 @@ def AlternativesOp : TransformDialectOp<"alternatives", let hasVerifier = 1; } +def AnnotateOp : TransformDialectOp<"annotate", + [DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods]> { + let summary = "Annotates the target operation with an attribute by name"; + let description = [{ + Adds an attribute with the given `name` to the `target` operation. An + optional `param` handle can be provided to give the attribute a specific + value, else a UnitAttr is added. A single attribute will be broadcasted to + all target operations, otherwise the attributes will be mapped 1:1 based on + the order within the handles. 
+ + Fails silently if the length of the parameter payload does not match the length of + the target payload. Does not consume the provided handles. + }]; + + let arguments = (ins TransformHandleTypeInterface:$target, + StrAttr:$name, + Optional:$param); + let results = (outs); + + let assemblyFormat = + "$target $name attr-dict (`=` $param^)?" + "`:` type($target) (`,` type($param)^)?"; +} + def CastOp : TransformDialectOp<"cast", [TransformOpInterface, TransformEachOpTrait, DeclareOpInterfaceMethods, diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index a3b55a45dd96e..5f18d9042fdf2 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -301,6 +301,43 @@ LogicalResult transform::AlternativesOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// AnnotateOp +//===----------------------------------------------------------------------===// + +DiagnosedSilenceableFailure +transform::AnnotateOp::apply(transform::TransformResults &results, + transform::TransformState &state) { + SmallVector targets = + llvm::to_vector(state.getPayloadOps(getTarget())); + + Attribute attr = UnitAttr::get(getContext()); + if (auto paramH = getParam()) { + ArrayRef params = state.getParams(paramH); + if (params.size() != 1) { + if (targets.size() != params.size()) { + return emitSilenceableError() + << "parameter and target have different payload lengths (" + << params.size() << " vs " << targets.size() << ")"; + } + for (auto &&[target, attr] : llvm::zip_equal(targets, params)) + target->setAttr(getName(), attr); + return DiagnosedSilenceableFailure::success(); + } + attr = params[0]; + } + for (auto target : targets) + target->setAttr(getName(), attr); + return DiagnosedSilenceableFailure::success(); +} + +void transform::AnnotateOp::getEffects( + SmallVectorImpl &effects) { + onlyReadsHandle(getTarget(), effects); + onlyReadsHandle(getParam(), effects); + modifiesPayload(effects); +} + //===----------------------------------------------------------------------===// // CastOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Transform/test-interpreter.mlir b/mlir/test/Dialect/Transform/test-interpreter.mlir index a885c89af0317..932b2cb011350 100644 --- a/mlir/test/Dialect/Transform/test-interpreter.mlir +++ b/mlir/test/Dialect/Transform/test-interpreter.mlir @@ -1620,3 +1620,37 @@ transform.sequence failures(propagate) { // expected-remark @below {{2}} test_print_number_of_associated_payload_ir_ops %0 : !transform.any_op } + + +// ----- + +// CHECK-LABEL: func @test_annotation() +// CHECK-NEXT: "test.annotate_me"() +// CHECK-SAME: broadcast_attr = 2 : i64 +// CHECK-SAME: new_attr = 1 : i32 +// CHECK-SAME: unit_attr +// CHECK-NEXT: "test.annotate_me"() +// CHECK-SAME: broadcast_attr = 2 : i64 +// CHECK-SAME: existing_attr = "test" +// CHECK-SAME: new_attr = 1 : i32 +// CHECK-SAME: unit_attr +// CHECK-NEXT: "test.annotate_me"() +// CHECK-SAME: broadcast_attr = 2 : i64 +// CHECK-SAME: new_attr = 1 : i32 +// CHECK-SAME: unit_attr +func.func @test_annotation() { + %0 = "test.annotate_me"() : () -> (i1) + %1 = "test.annotate_me"() {existing_attr = "test"} : () -> (i1) + %2 = "test.annotate_me"() {new_attr = 0} : () -> (i1) +} + +transform.sequence failures(propagate) { +^bb1(%arg0: !transform.any_op): + %0 = transform.structured.match ops{["test.annotate_me"]} in %arg0 : 
(!transform.any_op) -> !transform.any_op + %1 = transform.test_produce_param_with_number_of_test_ops %0 : !transform.any_op + transform.annotate %0 "new_attr" = %1 : !transform.any_op, !transform.test_dialect_param + + %2 = transform.param.constant 2 -> !transform.param + transform.annotate %0 "broadcast_attr" = %2 : !transform.any_op, !transform.param + transform.annotate %0 "unit_attr" : !transform.any_op +} From 2582b2e3ac19d3723daf6960b1edb7c0b627ff20 Mon Sep 17 00:00:00 2001 From: Lukas Sommer Date: Tue, 30 May 2023 15:45:54 +0000 Subject: [PATCH 094/704] [mlir][llvm] Add LLVM TargetExtType Add support for the `llvm::TargetExtType` to the MLIR LLVM dialect. Target extension types were introduced to represent target-specific types, which are opaque to the compiler and optimizations. The patch also enforces some of the constraints defined for the target extension type in the LLVM language reference manual. Signed-off-by: Lukas Sommer Reviewed By: ftynse, gysit, Dinistro Differential Revision: https://reviews.llvm.org/D151446 --- .../include/mlir/Dialect/LLVMIR/LLVMOpBase.td | 17 +++- mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td | 36 ++++++++ mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 43 ++++++++- mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp | 4 +- mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp | 89 ++++++++++++++++++- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 17 ++++ mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 10 +++ mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp | 11 ++- mlir/lib/Target/LLVMIR/TypeToLLVM.cpp | 10 ++- mlir/test/Dialect/LLVMIR/global.mlir | 13 +++ mlir/test/Dialect/LLVMIR/invalid.mlir | 36 ++++++++ mlir/test/Dialect/LLVMIR/types-invalid.mlir | 15 ++++ mlir/test/Dialect/LLVMIR/types.mlir | 17 ++++ .../Target/LLVMIR/Import/target-ext-type.ll | 53 +++++++++++ mlir/test/Target/LLVMIR/llvmir-types.mlir | 12 +++ mlir/test/Target/LLVMIR/target-ext-type.mlir | 28 ++++++ 16 files changed, 403 insertions(+), 8 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/Import/target-ext-type.ll create mode 100644 mlir/test/Target/LLVMIR/target-ext-type.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td index 00a3b80ecf69a..809e8ed4a332f 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td @@ -84,10 +84,23 @@ def LLVM_OpaqueStruct : Type< And<[LLVM_AnyStruct.predicate, CPred<"::llvm::cast<::mlir::LLVM::LLVMStructType>($_self).isOpaque()">]>>; +// Type constraint accepting any LLVM target extension type. +def LLVM_AnyTargetExt : Type($_self)">, + "LLVM target extension type">; + +// Type constraint accepting LLVM target extension types with no support for +// memory operations such as alloca, load and store. +def LLVM_NonLoadableTargetExtType : Type< + And<[LLVM_AnyTargetExt.predicate, + CPred<"!::llvm::cast<::mlir::LLVM::LLVMTargetExtType>($_self).supportsMemOps()">] + >>; + // Type constraint accepting any LLVM type that can be loaded or stored, i.e. a -// type that has size (not void, function or opaque struct type). +// type that has size (not void, function, opaque struct type or target +// extension type which does not support memory operations). 
def LLVM_LoadableType : Type< - Or<[And<[LLVM_PrimitiveType.predicate, Neg]>, + Or<[And<[LLVM_PrimitiveType.predicate, Neg, + Neg]>, LLVM_PointerElementTypeInterface.predicate]>, "LLVM type with size">; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td index e26d9d8acc79e..5f746e21e831b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td @@ -225,4 +225,40 @@ def LLVMScalableVectorType : LLVMType<"LLVMScalableVector", "vec"> { }]; } +//===----------------------------------------------------------------------===// +// LLVMTargetExtType +//===----------------------------------------------------------------------===// + +def LLVMTargetExtType : LLVMType<"LLVMTargetExt", "target"> { + let summary = "LLVM target-specific extension type"; + let description = [{ + LLVM dialect target extension type, which are generally unintrospectable + from target-independent optimizations. + + Target extension types have a string name, and optionally have type and/or + integer parameters. The exact meaning of any parameters is dependent on the + target. + }]; + + let parameters = (ins StringRefParameter<>:$extTypeName, + OptionalArrayRefParameter<"Type">:$typeParams, + OptionalArrayRefParameter<"unsigned int">:$intParams); + + let assemblyFormat = [{ + `<` $extTypeName (`,` custom($typeParams, $intParams)^ )? `>` + }]; + + let extraClassDeclaration = [{ + enum Property { + /// zeroinitializer is valid for this target extension type. + HasZeroInit = 1U << 0, + /// This type may be used as the value type of a global variable. + CanBeGlobal = 1U << 1, + }; + + bool hasProperty(Property Prop) const; + bool supportsMemOps() const; + }]; +} + #endif // LLVMTYPES_TD diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 06e8d79f68a6e..a31daaab17e1d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -266,8 +266,19 @@ static LogicalResult verifyOpaquePtr(Operation *op, LLVMPointerType ptrType, } LogicalResult AllocaOp::verify() { - return verifyOpaquePtr(getOperation(), llvm::cast(getType()), - getElemType()); + LLVMPointerType ptrType = llvm::cast(getType()); + if (failed(verifyOpaquePtr(getOperation(), ptrType, getElemType()))) + return failure(); + + Type elemTy = + (ptrType.isOpaque()) ? *getElemType() : ptrType.getElementType(); + // Only certain target extension types can be used in 'alloca'. + if (auto targetExtType = dyn_cast(elemTy); + targetExtType && !targetExtType.supportsMemOps()) + return emitOpError() + << "this target extension type cannot be used in alloca"; + + return success(); } //===----------------------------------------------------------------------===// @@ -1832,6 +1843,22 @@ LogicalResult GlobalOp::verify() { "attribute"); } + if (auto targetExtType = dyn_cast(getType())) { + if (!targetExtType.hasProperty(LLVMTargetExtType::CanBeGlobal)) + return emitOpError() + << "this target extension type cannot be used in a global"; + + if (Attribute value = getValueOrNull()) { + // Only a single, zero integer attribute (=zeroinitializer) is allowed for + // a global value with TargetExtType. + // TODO: Replace with 'zeroinitializer' once there is a dedicated + // zeroinitializer operation in the LLVM dialect. 
+ if (!isa(value) || !isZeroAttribute(value)) + return emitOpError() + << "expected zero value for global with target extension type"; + } + } + if (getLinkage() == Linkage::Common) { if (Attribute value = getValueOrNull()) { if (!isZeroAttribute(value)) { @@ -2288,6 +2315,18 @@ LogicalResult LLVM::ConstantOp::verify() { } return success(); } + if (auto targetExtType = dyn_cast(getType())) { + if (!targetExtType.hasProperty(LLVM::LLVMTargetExtType::HasZeroInit)) + return emitOpError() + << "target extension type does not support zero-initializer"; + // Only a single, zero integer attribute (=zeroinitializer) is allowed for a + // global value with TargetExtType. + if (!isa(getValue()) || !isZeroAttribute(getValue())) + return emitOpError() + << "only zero-initializer allowed for target extension types"; + + return success(); + } if (!llvm::isa(getValue())) return emitOpError() << "only supports integer, float, string or elements attributes"; diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp index 742ab5b632663..afb8c90606191 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp @@ -45,6 +45,7 @@ static StringRef getTypeKeyword(Type type) { [&](Type) { return "vec"; }) .Case([&](Type) { return "array"; }) .Case([&](Type) { return "struct"; }) + .Case([&](Type) { return "target"; }) .Default([](Type) -> StringRef { llvm_unreachable("unexpected 'llvm' type kind"); }); @@ -119,7 +120,7 @@ void mlir::LLVM::detail::printType(Type type, AsmPrinter &printer) { llvm::TypeSwitch(type) .Case( + LLVMScalableVectorType, LLVMFunctionType, LLVMTargetExtType>( [&](auto type) { type.print(printer); }) .Case([&](LLVMStructType structType) { printStructType(printer, structType); @@ -332,6 +333,7 @@ static Type dispatchParse(AsmParser &parser, bool allowAny = true) { .Case("vec", [&] { return parseVectorType(parser); }) .Case("array", [&] { return LLVMArrayType::parse(parser); }) .Case("struct", [&] { return parseStructType(parser); }) + .Case("target", [&] { return LLVMTargetExtType::parse(parser); }) .Default([&] { parser.emitError(keyLoc) << "unknown LLVM type: " << key; return Type(); diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp index be129ffe2aadc..95d76a14d2bd3 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp @@ -109,6 +109,59 @@ static void printPointer(AsmPrinter &p, Type elementType, } } +//===----------------------------------------------------------------------===// +// custom +//===----------------------------------------------------------------------===// + +/// Parses the parameter list for a target extension type. The parameter list +/// contains an optional list of type parameters, followed by an optional list +/// of integer parameters. Type and integer parameters cannot be interleaved in +/// the list. +/// extTypeParams ::= typeList? | intList? | (typeList "," intList) +/// typeList ::= type ("," type)* +/// intList ::= integer ("," integer)* +static ParseResult +parseExtTypeParams(AsmParser &p, SmallVectorImpl &typeParams, + SmallVectorImpl &intParams) { + bool parseType = true; + auto typeOrIntParser = [&]() -> ParseResult { + unsigned int i; + auto intResult = p.parseOptionalInteger(i); + if (intResult.has_value() && !failed(*intResult)) { + // Successfully parsed an integer. 
+ intParams.push_back(i); + // After the first integer was successfully parsed, no + // more types can be parsed. + parseType = false; + return success(); + } + if (parseType) { + Type t; + if (!parsePrettyLLVMType(p, t)) { + // Successfully parsed a type. + typeParams.push_back(t); + return success(); + } + } + return failure(); + }; + if (p.parseCommaSeparatedList(typeOrIntParser)) { + p.emitError(p.getCurrentLocation(), + "failed to parse parameter list for target extension type"); + return failure(); + } + return success(); +} + +static void printExtTypeParams(AsmPrinter &p, ArrayRef typeParams, + ArrayRef intParams) { + p << typeParams; + if (!typeParams.empty() && !intParams.empty()) + p << ", "; + + p << intParams; +} + //===----------------------------------------------------------------------===// // ODS-Generated Definitions //===----------------------------------------------------------------------===// @@ -721,6 +774,35 @@ LLVMScalableVectorType::verify(function_ref emitError, emitError, elementType, numElements); } +//===----------------------------------------------------------------------===// +// LLVMTargetExtType. +//===----------------------------------------------------------------------===// + +static constexpr llvm::StringRef kSpirvPrefix = "spirv."; +static constexpr llvm::StringRef kArmSVCount = "aarch64.svcount"; + +bool LLVM::LLVMTargetExtType::hasProperty(Property prop) const { + // See llvm/lib/IR/Type.cpp for reference. + uint64_t properties = 0; + + if (getExtTypeName().starts_with(kSpirvPrefix)) + properties |= + (LLVMTargetExtType::HasZeroInit | LLVM::LLVMTargetExtType::CanBeGlobal); + + return (properties & prop) == prop; +} + +bool LLVM::LLVMTargetExtType::supportsMemOps() const { + // See llvm/lib/IR/Type.cpp for reference. + if (getExtTypeName().starts_with(kSpirvPrefix)) + return true; + + if (getExtTypeName() == kArmSVCount) + return true; + + return false; +} + //===----------------------------------------------------------------------===// // Utility functions. 
//===----------------------------------------------------------------------===// @@ -746,6 +828,7 @@ bool mlir::LLVM::isCompatibleOuterType(Type type) { LLVMTokenType, LLVMFixedVectorType, LLVMScalableVectorType, + LLVMTargetExtType, LLVMVoidType, LLVMX86MMXType >(type)) { @@ -791,6 +874,9 @@ static bool isCompatibleImpl(Type type, DenseSet &compatibleTypes) { return true; return isCompatible(pointerType.getElementType()); }) + .Case([&](auto extType) { + return llvm::all_of(extType.getTypeParams(), isCompatible); + }) // clang-format off .Case< LLVMFixedVectorType, @@ -974,7 +1060,8 @@ llvm::TypeSize mlir::LLVM::getPrimitiveTypeSizeInBits(Type type) { .Default([](Type ty) { assert((llvm::isa(ty)) && + LLVMPointerType, LLVMFunctionType, LLVMTargetExtType>( + ty)) && "unexpected missing support for primitive type"); return llvm::TypeSize::Fixed(0); }); diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 05d6b7827d83a..5f9eb1835cd2d 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1065,6 +1065,23 @@ FailureOr ModuleImport::convertConstant(llvm::Constant *constant) { return root; } + if (auto *constTargetNone = dyn_cast(constant)) { + LLVMTargetExtType targetExtType = + cast(convertType(constTargetNone->getType())); + assert(targetExtType.hasProperty(LLVMTargetExtType::HasZeroInit) && + "target extension type does not support zero-initialization"); + // As the number of values needed for initialization is target-specific and + // opaque to the compiler, use a single i64 zero-valued attribute to + // represent the 'zeroinitializer', which is the only constant value allowed + // for target extension types (besides poison and undef). + // TODO: Replace with 'zeroinitializer' once there is a dedicated + // zeroinitializer operation in the LLVM dialect. + return builder + .create(loc, targetExtType, + builder.getI64IntegerAttr(0)) + .getRes(); + } + StringRef error = ""; if (isa(constant)) error = " since blockaddress(...) is unsupported"; diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 772721e31e1ce..9b8e9a3ee1f3e 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -342,6 +342,16 @@ llvm::Constant *mlir::LLVM::detail::getLLVMConstant( return nullptr; return llvm::ConstantStruct::get(structType, {real, imag}); } + if (auto *targetExtType = dyn_cast<::llvm::TargetExtType>(llvmType)) { + // TODO: Replace with 'zeroinitializer' once there is a dedicated + // zeroinitializer operation in the LLVM dialect. + auto intAttr = dyn_cast(attr); + if (!intAttr || intAttr.getInt() != 0) + emitError(loc, + "Only zero-initialization allowed for target extension type"); + + return llvm::ConstantTargetNone::get(targetExtType); + } // For integer types, we allow a mismatch in sizes as the index type in // MLIR might have a different size than the index type in the LLVM module. 
if (auto intAttr = dyn_cast(attr)) diff --git a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp index 26e426b023272..458e71953e6cf 100644 --- a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp +++ b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp @@ -36,7 +36,7 @@ class TypeFromLLVMIRTranslatorImpl { llvm::TypeSwitch(type) .Case( + llvm::ScalableVectorType, llvm::TargetExtType>( [this](auto *type) { return this->translate(type); }) .Default([this](llvm::Type *type) { return translatePrimitiveType(type); @@ -135,6 +135,15 @@ class TypeFromLLVMIRTranslatorImpl { translateType(type->getElementType()), type->getMinNumElements()); } + /// Translates the given target extension type. + Type translate(llvm::TargetExtType *type) { + SmallVector typeParams; + translateTypes(type->type_params(), typeParams); + + return LLVM::LLVMTargetExtType::get(&context, type->getName(), typeParams, + type->int_params()); + } + /// Translates a list of types. void translateTypes(ArrayRef types, SmallVectorImpl &result) { diff --git a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp index d3ecede279156..6d8b415ff09dc 100644 --- a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp +++ b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp @@ -73,7 +73,7 @@ class TypeToLLVMIRTranslatorImpl { .Case( + VectorType, LLVM::LLVMTargetExtType>( [this](auto type) { return this->translate(type); }) .Default([](Type t) -> llvm::Type * { llvm_unreachable("unknown LLVM dialect type"); @@ -155,6 +155,14 @@ class TypeToLLVMIRTranslatorImpl { type.getMinNumElements()); } + /// Translates the given target extension type. + llvm::Type *translate(LLVM::LLVMTargetExtType type) { + SmallVector typeParams; + translateTypes(type.getTypeParams(), typeParams); + return llvm::TargetExtType::get(context, type.getExtTypeName(), typeParams, + type.getIntParams()); + } + /// Translates a list of types. 
void translateTypes(ArrayRef types, SmallVectorImpl &result) { diff --git a/mlir/test/Dialect/LLVMIR/global.mlir b/mlir/test/Dialect/LLVMIR/global.mlir index c53fdeff925d2..00b73f0549fab 100644 --- a/mlir/test/Dialect/LLVMIR/global.mlir +++ b/mlir/test/Dialect/LLVMIR/global.mlir @@ -232,3 +232,16 @@ llvm.func @dtor() { // CHECK: llvm.mlir.global_dtors {dtors = [@dtor], priorities = [0 : i32]} llvm.mlir.global_dtors { dtors = [@dtor], priorities = [0 : i32]} + +// ----- + +// CHECK: llvm.mlir.global external @target_ext() {addr_space = 0 : i32} : !llvm.target<"spirv.Image", i32, 0> +llvm.mlir.global @target_ext() : !llvm.target<"spirv.Image", i32, 0> + +// CHECK: llvm.mlir.global external @target_ext_init(0 : i64) {addr_space = 0 : i32} : !llvm.target<"spirv.Image", i32, 0> +llvm.mlir.global @target_ext_init(0 : i64) : !llvm.target<"spirv.Image", i32, 0> + +// ----- + +// expected-error @+1 {{expected zero value for global with target extension type}} +llvm.mlir.global @target_fail(1 : i64) : !llvm.target<"spirv.Image", i32, 0> diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index aa3498a5ee950..b88619b1e388d 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -1387,3 +1387,39 @@ func.func @invalid_bitcast_addr_cast_vec(%arg : !llvm.vec<4 x ptr<1>>) { // expected-error@+1 {{cannot cast pointers of different address spaces, use 'llvm.addrspacecast' instead}} %0 = llvm.bitcast %arg : !llvm.vec<4 x ptr<1>> to !llvm.vec<4 x ptr> } + +// ----- + +func.func @invalid_target_ext_alloca() { + %0 = llvm.mlir.constant(1 : i64) : i64 + // expected-error@+1 {{this target extension type cannot be used in alloca}} + %1 = llvm.alloca %0 x !llvm.target<"no_alloca"> : (i64) -> !llvm.ptr +} + +// ----- + +func.func @invalid_target_ext_load(%arg0 : !llvm.ptr) { + // expected-error@+1 {{result #0 must be LLVM type with size, but got '!llvm.target<"no_load">'}} + %0 = llvm.load %arg0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.target<"no_load"> +} + +// ----- + +func.func @invalid_target_ext_atomic(%arg0 : !llvm.ptr) { + // expected-error@+1 {{unsupported type '!llvm.target<"spirv.Event">' for atomic access}} + %0 = llvm.load %arg0 atomic monotonic {alignment = 8 : i64} : !llvm.ptr -> !llvm.target<"spirv.Event"> +} + +// ----- + +func.func @invalid_target_ext_constant() { + // expected-error@+1 {{target extension type does not support zero-initializer}} + %0 = llvm.mlir.constant(0 : index) : !llvm.target<"invalid_constant"> +} + +// ----- + +func.func @invalid_target_ext_constant() { + // expected-error@+1 {{only zero-initializer allowed for target extension types}} + %0 = llvm.mlir.constant(42 : index) : !llvm.target<"spirv.Event"> +} diff --git a/mlir/test/Dialect/LLVMIR/types-invalid.mlir b/mlir/test/Dialect/LLVMIR/types-invalid.mlir index fce100e6a865c..f06f056cf4904 100644 --- a/mlir/test/Dialect/LLVMIR/types-invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/types-invalid.mlir @@ -158,3 +158,18 @@ func.func private @unexpected_type() -> !llvm.f32 // expected-error @below {{cannot use !llvm.vec for built-in primitives, use 'vector' instead}} func.func private @llvm_vector_primitive() -> !llvm.vec<4 x f32> + +// ----- + +func.func private @target_ext_invalid_order() { + // expected-error @+1 {{failed to parse parameter list for target extension type}} + "some.op"() : () -> !llvm.target<"target1", 5, i32, 1> +} + +// ----- + +func.func private @target_ext_no_name() { + // expected-error@below {{expected string}} + // 
expected-error@below {{failed to parse LLVMTargetExtType parameter 'extTypeName' which is to be a `::llvm::StringRef`}} + "some.op"() : () -> !llvm.target +} diff --git a/mlir/test/Dialect/LLVMIR/types.mlir b/mlir/test/Dialect/LLVMIR/types.mlir index 42352ce697f02..c9bce337a3b8a 100644 --- a/mlir/test/Dialect/LLVMIR/types.mlir +++ b/mlir/test/Dialect/LLVMIR/types.mlir @@ -176,3 +176,20 @@ llvm.func @aliases() { "some.op"() : () -> !llvm.struct<(i32, f32, !qux)> llvm.return } + +// ----- + +// CHECK-LABEL: ext_target +llvm.func @ext_target() { + // CHECK: !llvm.target<"target1", i32, 1> + %0 = "some.op"() : () -> !llvm.target<"target1", i32, 1> + // CHECK: !llvm.target<"target2"> + %1 = "some.op"() : () -> !llvm.target<"target2"> + // CHECK: !llvm.target<"target3", i32, i64, f64> + %2 = "some.op"() : () -> !llvm.target<"target3", i32, i64, f64> + // CHECK: !llvm.target<"target4", 1, 0, 42> + %3 = "some.op"() : () -> !llvm.target<"target4", 1, 0, 42> + // CHECK: !llvm.target<"target5", i32, f64, 0, 5> + %4 = "some.op"() : () -> !llvm.target<"target5", i32, f64, 0, 5> + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/Import/target-ext-type.ll b/mlir/test/Target/LLVMIR/Import/target-ext-type.ll new file mode 100644 index 0000000000000..62194cad9152c --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/target-ext-type.ll @@ -0,0 +1,53 @@ +; RUN: mlir-translate -import-llvm %s | FileCheck %s + +; CHECK-LABEL: llvm.mlir.global external @global() {addr_space = 0 : i32} +; CHECK-SAME: !llvm.target<"spirv.DeviceEvent"> +; CHECK-NEXT: %0 = llvm.mlir.constant(0 : i64) : !llvm.target<"spirv.DeviceEvent"> +; CHECK-NEXT: llvm.return %0 : !llvm.target<"spirv.DeviceEvent"> +@global = global target("spirv.DeviceEvent") zeroinitializer + +; CHECK-LABEL: llvm.func spir_kernelcc @func1( +define spir_kernel void @func1( + ; CHECK-SAME: %arg0: !llvm.target<"spirv.Pipe", 0> + target("spirv.Pipe", 0) %a, + ; CHECK-SAME: %arg1: !llvm.target<"spirv.Pipe", 1> + target("spirv.Pipe", 1) %b, + ; CHECK-SAME: %arg2: !llvm.target<"spirv.Image", !llvm.void, 0, 0, 0, 0, 0, 0, 0> + target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0) %c1, + ; CHECK-SAME: %arg3: !llvm.target<"spirv.Image", i32, 1, 0, 0, 0, 0, 0, 0> + target("spirv.Image", i32, 1, 0, 0, 0, 0, 0, 0) %d1, + ; CHECK-SAME: %arg4: !llvm.target<"spirv.Image", i32, 2, 0, 0, 0, 0, 0, 0> + target("spirv.Image", i32, 2, 0, 0, 0, 0, 0, 0) %e1, + ; CHECK-SAME: %arg5: !llvm.target<"spirv.Image", f16, 1, 0, 1, 0, 0, 0, 0> + target("spirv.Image", half, 1, 0, 1, 0, 0, 0, 0) %f1, + ; CHECK-SAME: %arg6: !llvm.target<"spirv.Image", f32, 5, 0, 0, 0, 0, 0, 0> + target("spirv.Image", float, 5, 0, 0, 0, 0, 0, 0) %g1, + ; CHECK-SAME: %arg7: !llvm.target<"spirv.Image", !llvm.void, 0, 0, 0, 0, 0, 0, 1> + target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 1) %c2, + ; CHECK-SAME: %arg8: !llvm.target<"spirv.Image", !llvm.void, 1, 0, 0, 0, 0, 0, 2>) + target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 2) %d3) { +entry: + ret void +} + +; CHECK-LABEL: llvm.func @func2() +; CHECK-SAME: !llvm.target<"spirv.Event"> { +define target("spirv.Event") @func2() { + ; CHECK-NEXT: %0 = llvm.mlir.constant(1 : i32) : i32 + ; CHECK-NEXT: %1 = llvm.mlir.poison : !llvm.target<"spirv.Event"> + ; CHECK-NEXT: %2 = llvm.alloca %0 x !llvm.target<"spirv.Event"> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %mem = alloca target("spirv.Event") + ; CHECK-NEXT: %3 = llvm.load %2 {alignment = 8 : i64} : !llvm.ptr -> !llvm.target<"spirv.Event"> + %val = load target("spirv.Event"), ptr %mem + ; CHECK-NEXT: llvm.return %1 : 
!llvm.target<"spirv.Event"> + ret target("spirv.Event") poison +} + +; CHECK-LABEL: llvm.func @func3() +define void @func3() { + ; CHECK-NEXT: %0 = llvm.mlir.constant(0 : i64) : !llvm.target<"spirv.DeviceEvent"> + ; CHECK-NEXT: %1 = llvm.freeze %0 : !llvm.target<"spirv.DeviceEvent"> + %val = freeze target("spirv.DeviceEvent") zeroinitializer + ; CHECK-NEXT: llvm.return + ret void +} diff --git a/mlir/test/Target/LLVMIR/llvmir-types.mlir b/mlir/test/Target/LLVMIR/llvmir-types.mlir index 9d972f6fa6b63..a92d46dfadfe2 100644 --- a/mlir/test/Target/LLVMIR/llvmir-types.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-types.mlir @@ -141,6 +141,18 @@ llvm.func @return_s_sp_i32() -> !llvm.struct<(struct)> // CHECK: declare <{ { i32 } }> @return_sp_s_i32() llvm.func @return_sp_s_i32() -> !llvm.struct)> +// CHECK: declare target("target-no-param") @return_target_ext_no_param() +llvm.func @return_target_ext_no_param() -> !llvm.target<"target-no-param"> + +// CHECK: declare target("target-type-param", i32, double) @return_target_ext_type_params() +llvm.func @return_target_ext_type_params() -> !llvm.target<"target-type-param", i32, f64> + +// CHECK: declare target("target-int-param", 0, 42) @return_target_ext_int_params() +llvm.func @return_target_ext_int_params() -> !llvm.target<"target-int-param", 0, 42> + +// CHECK: declare target("target-params", i32, double, 0, 5) @return_target_ext_params() +llvm.func @return_target_ext_params() -> !llvm.target<"target-params", i32, f64, 0, 5> + // ----- // Put structs into a separate split so that we can match their declarations // locally. diff --git a/mlir/test/Target/LLVMIR/target-ext-type.mlir b/mlir/test/Target/LLVMIR/target-ext-type.mlir new file mode 100644 index 0000000000000..e7004b2699dc6 --- /dev/null +++ b/mlir/test/Target/LLVMIR/target-ext-type.mlir @@ -0,0 +1,28 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// CHECK: @global = global target("spirv.DeviceEvent") zeroinitializer +llvm.mlir.global external @global() {addr_space = 0 : i32} : !llvm.target<"spirv.DeviceEvent"> { + %0 = llvm.mlir.constant(0 : i64) : !llvm.target<"spirv.DeviceEvent"> + llvm.return %0 : !llvm.target<"spirv.DeviceEvent"> +} + +// CHECK-LABEL: define target("spirv.Event") @func2() { +// CHECK-NEXT: %1 = alloca target("spirv.Event"), align 8 +// CHECK-NEXT: %2 = load target("spirv.Event"), ptr %1, align 8 +// CHECK-NEXT: ret target("spirv.Event") poison +llvm.func @func2() -> !llvm.target<"spirv.Event"> { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.mlir.poison : !llvm.target<"spirv.Event"> + %2 = llvm.alloca %0 x !llvm.target<"spirv.Event"> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %3 = llvm.load %2 {alignment = 8 : i64} : !llvm.ptr -> !llvm.target<"spirv.Event"> + llvm.return %1 : !llvm.target<"spirv.Event"> +} + +// CHECK-LABEL: define void @func3() { +// CHECK-NEXT: %1 = freeze target("spirv.DeviceEvent") zeroinitializer +// CHECK-NEXT: ret void +llvm.func @func3() { + %0 = llvm.mlir.constant(0 : i64) : !llvm.target<"spirv.DeviceEvent"> + %1 = llvm.freeze %0 : !llvm.target<"spirv.DeviceEvent"> + llvm.return +} From 7cdb875d4dcd5bb65708521dc121e3c969c561b9 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Tue, 30 May 2023 15:58:45 +0000 Subject: [PATCH 095/704] [mlir] silence msvc warning --- mlir/examples/transform/Ch3/lib/MyExtension.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/examples/transform/Ch3/lib/MyExtension.cpp b/mlir/examples/transform/Ch3/lib/MyExtension.cpp index 41b3ffef1b498..a4f9108df0e8c 100644 --- 
a/mlir/examples/transform/Ch3/lib/MyExtension.cpp +++ b/mlir/examples/transform/Ch3/lib/MyExtension.cpp @@ -76,8 +76,8 @@ void MyExtension::init() { // ODS generates these helpers for type printing and parsing, but the // Transform dialect provides its own support for types supplied by the // extension. Reference these functions to avoid a compiler warning. - (void)generatedTypeParser; - (void)generatedTypePrinter; + (void)&generatedTypeParser; + (void)&generatedTypePrinter; // Finally, we register the additional transform operations with the dialect. // List all operations generated from ODS. This call will perform additional From 0da99ffe1afc526844f4146c95b4b2ab251de1a9 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Tue, 30 May 2023 18:02:10 +0200 Subject: [PATCH 096/704] [clang][analyzer][NFC] Remove unnecessary casts around Allocate function calls Reviewed By: steakhal Differential Revision: https://reviews.llvm.org/D151726 --- .../StaticAnalyzer/Core/BasicValueFactory.cpp | 12 +++++------ .../lib/StaticAnalyzer/Core/ExplodedGraph.cpp | 4 ++-- .../lib/StaticAnalyzer/Core/ProgramState.cpp | 2 +- .../lib/StaticAnalyzer/Core/SymbolManager.cpp | 20 +++++++++---------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp b/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp index fc736dd452aab..5a5851975bb60 100644 --- a/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp +++ b/clang/lib/StaticAnalyzer/Core/BasicValueFactory.cpp @@ -97,7 +97,7 @@ const llvm::APSInt& BasicValueFactory::getValue(const llvm::APSInt& X) { FoldNodeTy* P = APSIntSet.FindNodeOrInsertPos(ID, InsertPos); if (!P) { - P = (FoldNodeTy*) BPAlloc.Allocate(); + P = BPAlloc.Allocate(); new (P) FoldNodeTy(X); APSIntSet.InsertNode(P, InsertPos); } @@ -132,7 +132,7 @@ BasicValueFactory::getCompoundValData(QualType T, CompoundValData* D = CompoundValDataSet.FindNodeOrInsertPos(ID, InsertPos); if (!D) { - D = (CompoundValData*) BPAlloc.Allocate(); + D = BPAlloc.Allocate(); new (D) CompoundValData(T, Vals); CompoundValDataSet.InsertNode(D, InsertPos); } @@ -151,7 +151,7 @@ BasicValueFactory::getLazyCompoundValData(const StoreRef &store, LazyCompoundValDataSet.FindNodeOrInsertPos(ID, InsertPos); if (!D) { - D = (LazyCompoundValData*) BPAlloc.Allocate(); + D = BPAlloc.Allocate(); new (D) LazyCompoundValData(store, region); LazyCompoundValDataSet.InsertNode(D, InsertPos); } @@ -169,7 +169,7 @@ const PointerToMemberData *BasicValueFactory::getPointerToMemberData( PointerToMemberDataSet.FindNodeOrInsertPos(ID, InsertPos); if (!D) { - D = (PointerToMemberData *)BPAlloc.Allocate(); + D = BPAlloc.Allocate(); new (D) PointerToMemberData(ND, L); PointerToMemberDataSet.InsertNode(D, InsertPos); } @@ -358,7 +358,7 @@ BasicValueFactory::getPersistentSValWithData(const SVal& V, uintptr_t Data) { FoldNodeTy* P = Map.FindNodeOrInsertPos(ID, InsertPos); if (!P) { - P = (FoldNodeTy*) BPAlloc.Allocate(); + P = BPAlloc.Allocate(); new (P) FoldNodeTy(std::make_pair(V, Data)); Map.InsertNode(P, InsertPos); } @@ -383,7 +383,7 @@ BasicValueFactory::getPersistentSValPair(const SVal& V1, const SVal& V2) { FoldNodeTy* P = Map.FindNodeOrInsertPos(ID, InsertPos); if (!P) { - P = (FoldNodeTy*) BPAlloc.Allocate(); + P = BPAlloc.Allocate(); new (P) FoldNodeTy(std::make_pair(V1, V2)); Map.InsertNode(P, InsertPos); } diff --git a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp index cac7ec3f8cf7f..314a4feda81b3 100644 --- 
a/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExplodedGraph.cpp @@ -408,7 +408,7 @@ ExplodedNode *ExplodedGraph::getNode(const ProgramPoint &L, } else { // Allocate a new node. - V = (NodeTy*) getAllocator().Allocate(); + V = getAllocator().Allocate(); } ++NumNodes; @@ -432,7 +432,7 @@ ExplodedNode *ExplodedGraph::createUncachedNode(const ProgramPoint &L, ProgramStateRef State, int64_t Id, bool IsSink) { - NodeTy *V = (NodeTy *) getAllocator().Allocate(); + NodeTy *V = getAllocator().Allocate(); new (V) NodeTy(L, State, Id, IsSink); return V; } diff --git a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp index 90ebbaad2bf3a..e90ebab43c78e 100644 --- a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp +++ b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp @@ -424,7 +424,7 @@ ProgramStateRef ProgramStateManager::getPersistentState(ProgramState &State) { freeStates.pop_back(); } else { - newState = (ProgramState*) Alloc.Allocate(); + newState = Alloc.Allocate(); } new (newState) ProgramState(State); StateSet.InsertNode(newState, InsertPos); diff --git a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp index 032605ffe7a23..b4f64bc3a7b3e 100644 --- a/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp +++ b/clang/lib/StaticAnalyzer/Core/SymbolManager.cpp @@ -170,7 +170,7 @@ SymbolManager::getRegionValueSymbol(const TypedValueRegion* R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = (SymExpr*) BPAlloc.Allocate(); + SD = BPAlloc.Allocate(); new (SD) SymbolRegionValue(SymbolCounter, R); DataSet.InsertNode(SD, InsertPos); ++SymbolCounter; @@ -189,7 +189,7 @@ const SymbolConjured* SymbolManager::conjureSymbol(const Stmt *E, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = (SymExpr*) BPAlloc.Allocate(); + SD = BPAlloc.Allocate(); new (SD) SymbolConjured(SymbolCounter, E, LCtx, T, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); ++SymbolCounter; @@ -206,7 +206,7 @@ SymbolManager::getDerivedSymbol(SymbolRef parentSymbol, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = (SymExpr*) BPAlloc.Allocate(); + SD = BPAlloc.Allocate(); new (SD) SymbolDerived(SymbolCounter, parentSymbol, R); DataSet.InsertNode(SD, InsertPos); ++SymbolCounter; @@ -222,7 +222,7 @@ SymbolManager::getExtentSymbol(const SubRegion *R) { void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = (SymExpr*) BPAlloc.Allocate(); + SD = BPAlloc.Allocate(); new (SD) SymbolExtent(SymbolCounter, R); DataSet.InsertNode(SD, InsertPos); ++SymbolCounter; @@ -240,7 +240,7 @@ SymbolManager::getMetadataSymbol(const MemRegion* R, const Stmt *S, QualType T, void *InsertPos; SymExpr *SD = DataSet.FindNodeOrInsertPos(profile, InsertPos); if (!SD) { - SD = (SymExpr*) BPAlloc.Allocate(); + SD = BPAlloc.Allocate(); new (SD) SymbolMetadata(SymbolCounter, R, S, T, LCtx, Count, SymbolTag); DataSet.InsertNode(SD, InsertPos); ++SymbolCounter; @@ -257,7 +257,7 @@ SymbolManager::getCastSymbol(const SymExpr *Op, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = (SymbolCast*) BPAlloc.Allocate(); + data = BPAlloc.Allocate(); new (data) SymbolCast(Op, From, To); DataSet.InsertNode(data, InsertPos); } @@ -275,7 +275,7 @@ const SymIntExpr *SymbolManager::getSymIntExpr(const SymExpr *lhs, SymExpr *data = 
DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = (SymIntExpr*) BPAlloc.Allocate(); + data = BPAlloc.Allocate(); new (data) SymIntExpr(lhs, op, v, t); DataSet.InsertNode(data, InsertPos); } @@ -293,7 +293,7 @@ const IntSymExpr *SymbolManager::getIntSymExpr(const llvm::APSInt& lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = (IntSymExpr*) BPAlloc.Allocate(); + data = BPAlloc.Allocate(); new (data) IntSymExpr(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -311,7 +311,7 @@ const SymSymExpr *SymbolManager::getSymSymExpr(const SymExpr *lhs, SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = (SymSymExpr*) BPAlloc.Allocate(); + data = BPAlloc.Allocate(); new (data) SymSymExpr(lhs, op, rhs, t); DataSet.InsertNode(data, InsertPos); } @@ -327,7 +327,7 @@ const UnarySymExpr *SymbolManager::getUnarySymExpr(const SymExpr *Operand, void *InsertPos; SymExpr *data = DataSet.FindNodeOrInsertPos(ID, InsertPos); if (!data) { - data = (UnarySymExpr *)BPAlloc.Allocate(); + data = BPAlloc.Allocate(); new (data) UnarySymExpr(Operand, Opc, T); DataSet.InsertNode(data, InsertPos); } From daa95c7de5b7d004bd6c48f5099b7b88f1f5d16d Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Tue, 30 May 2023 18:10:58 +0200 Subject: [PATCH 097/704] [clang][analyzer][NFC] Remove unnecessary FALLTHROUGH markers They are redundant with the [[fallthrough]]; attribute that follows. Reviewed By: steakhal Differential Revision: https://reviews.llvm.org/D151723 --- clang/lib/CodeGen/CGCall.cpp | 1 - clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 1 - clang/lib/StaticAnalyzer/Core/SValBuilder.cpp | 1 - 3 files changed, 3 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index ec28c1db207a6..09ccb63dceeb5 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5716,7 +5716,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, assert(unpaddedIndex == 0); Builder.CreateStore(elt, eltAddr); } - // FALLTHROUGH [[fallthrough]]; } diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index bd5781a81bb5b..194a592fc019a 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -2121,7 +2121,6 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, } } } - // FALLTHROUGH [[fallthrough]]; } diff --git a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp index fed17c77f03d0..4fe828bdf7681 100644 --- a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp +++ b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp @@ -395,7 +395,6 @@ std::optional SValBuilder::getConstantVal(const Expr *E) { return evalCast(*Val, CE->getType(), SE->getType()); } } - // FALLTHROUGH [[fallthrough]]; } From 40a81d3100b416393557f015efc971497c0bea46 Mon Sep 17 00:00:00 2001 From: Igor Kirillov Date: Wed, 19 Apr 2023 10:11:33 +0000 Subject: [PATCH 098/704] [CodeGen] Refactor IR generation functions to use IRBuilder in ComplexDeinterleaving pass This patch updates several functions in LLVM's IR generation code to accept an IRBuilder object as an argument, rather than an Instruction that indicates the insertion point for new instructions. This change is necessary to handle sophisticated -Ofast optimization cases from D148558 where it's unclear which instructions should be used as the insertion point for new operations. 
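For illustration, the shape of the interface change is sketched below (distilled from the TargetLowering.h hunk that follows; simplified, not the complete declaration):

    // Before: the target hook took an Instruction that implicitly fixed the
    // insertion point for any IR it created.
    virtual Value *createComplexDeinterleavingIR(
        Instruction *I, ComplexDeinterleavingOperation OperationType,
        ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
        Value *Accumulator = nullptr) const;

    // After: the hook takes an IRBuilderBase, so the pass decides where new
    // instructions are emitted (e.g. a builder positioned at the root
    // instruction) and the same builder is threaded through recursive calls.
    virtual Value *createComplexDeinterleavingIR(
        IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
        ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
        Value *Accumulator = nullptr) const;
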
Differential Revision: https://reviews.llvm.org/D148703 --- llvm/include/llvm/CodeGen/TargetLowering.h | 2 +- .../lib/CodeGen/ComplexDeinterleavingPass.cpp | 30 +++++++++---------- .../Target/AArch64/AArch64ISelLowering.cpp | 8 ++--- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 8 ++--- llvm/lib/Target/ARM/ARMISelLowering.h | 2 +- .../complex-deinterleaving-mixed-cases.ll | 10 +++---- 7 files changed, 29 insertions(+), 33 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index b2d73b286b0ad..908d881d7f6da 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3196,7 +3196,7 @@ class TargetLoweringBase { /// If one cannot be created using all the given inputs, nullptr should be /// returned. virtual Value *createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator = nullptr) const { return nullptr; diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp index 5f06a666a5f2e..4351d68ebc87c 100644 --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -267,7 +267,7 @@ class ComplexDeinterleavingGraph { /// intrinsic (for both fixed and scalable vectors) NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag); - Value *replaceNode(RawNodePtr Node); + Value *replaceNode(IRBuilderBase &Builder, RawNodePtr Node); public: void dump() { dump(dbgs()); } @@ -1011,7 +1011,8 @@ ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real, return submitCompositeNode(PlaceholderNode); } -static Value *replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node, +static Value *replaceSymmetricNode(IRBuilderBase &B, + ComplexDeinterleavingGraph::RawNodePtr Node, Value *InputA, Value *InputB) { Instruction *I = Node->Real; if (I->isUnaryOp()) @@ -1021,8 +1022,6 @@ static Value *replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node, assert(InputB && "Binary symmetric operations need two inputs, only one " "was provided."); - IRBuilder<> B(I); - switch (I->getOpcode()) { case Instruction::FNeg: return B.CreateFNegFMF(InputA, I); @@ -1037,27 +1036,28 @@ static Value *replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node, return nullptr; } -Value *ComplexDeinterleavingGraph::replaceNode( - ComplexDeinterleavingGraph::RawNodePtr Node) { +Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder, + RawNodePtr Node) { if (Node->ReplacementNode) return Node->ReplacementNode; - Value *Input0 = replaceNode(Node->Operands[0]); - Value *Input1 = - Node->Operands.size() > 1 ? replaceNode(Node->Operands[1]) : nullptr; - Value *Accumulator = - Node->Operands.size() > 2 ? replaceNode(Node->Operands[2]) : nullptr; + Value *Input0 = replaceNode(Builder, Node->Operands[0]); + Value *Input1 = Node->Operands.size() > 1 + ? replaceNode(Builder, Node->Operands[1]) + : nullptr; + Value *Accumulator = Node->Operands.size() > 2 + ? 
replaceNode(Builder, Node->Operands[2]) + : nullptr; if (Input1) assert(Input0->getType() == Input1->getType() && "Node inputs need to be of the same type"); if (Node->Operation == ComplexDeinterleavingOperation::Symmetric) - Node->ReplacementNode = replaceSymmetricNode(Node, Input0, Input1); + Node->ReplacementNode = replaceSymmetricNode(Builder, Node, Input0, Input1); else Node->ReplacementNode = TL->createComplexDeinterleavingIR( - Node->Real, Node->Operation, Node->Rotation, Input0, Input1, - Accumulator); + Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); assert(Node->ReplacementNode && "Target failed to create Intrinsic call."); NumComplexTransformations += 1; @@ -1074,7 +1074,7 @@ void ComplexDeinterleavingGraph::replaceNodes() { IRBuilder<> Builder(RootInstruction); auto RootNode = RootToNode[RootInstruction]; - Value *R = replaceNode(RootNode.get()); + Value *R = replaceNode(Builder, RootNode.get()); assert(R && "Unable to find replacement for RootInstruction"); DeadInstrRoots.push_back(RootInstruction); RootInstruction->replaceAllUsesWith(R); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0a628fc402d69..b8ae8a034e54c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -25286,14 +25286,12 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( } Value *AArch64TargetLowering::createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { VectorType *Ty = cast(InputA->getType()); bool IsScalable = Ty->isScalableTy(); - IRBuilder<> B(I); - unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue(); @@ -25317,9 +25315,9 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride)); } auto *LowerSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); + B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); + B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt, B.getInt64(0)); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 689c2d1860064..cf766a74d6949 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -843,7 +843,7 @@ class AArch64TargetLowering : public TargetLowering { ComplexDeinterleavingOperation Operation, Type *Ty) const override; Value *createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator = nullptr) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9eab7b0e53d12..9cde9205335fd 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -22060,14 +22060,12 @@ bool 
ARMTargetLowering::isComplexDeinterleavingOperationSupported( } Value *ARMTargetLowering::createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { FixedVectorType *Ty = cast(InputA->getType()); - IRBuilder<> B(I); - unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements(); assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits"); @@ -22092,9 +22090,9 @@ Value *ARMTargetLowering::createComplexDeinterleavingIR( } auto *LowerSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); + B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); + B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); ArrayRef JoinMask(&SplitSeqVec[0], Ty->getNumElements()); return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 49fc5a50686a1..2dd54602ef61b 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -750,7 +750,7 @@ class VectorType; ComplexDeinterleavingOperation Operation, Type *Ty) const override; Value *createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator = nullptr) const override; diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll index 9aa6a856bc02c..65012899c97e3 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll @@ -220,11 +220,11 @@ define <4 x float> @mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEXT: fcmla v3.4s, v1.4s, v0.4s, #0 -; CHECK-NEXT: fcmla v4.4s, v2.4s, v0.4s, #0 -; CHECK-NEXT: fcmla v3.4s, v1.4s, v0.4s, #90 -; CHECK-NEXT: fcmla v4.4s, v2.4s, v0.4s, #90 -; CHECK-NEXT: fcadd v0.4s, v4.4s, v3.4s, #90 +; CHECK-NEXT: fcmla v3.4s, v2.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v4.4s, v1.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v3.4s, v2.4s, v0.4s, #90 +; CHECK-NEXT: fcmla v4.4s, v1.4s, v0.4s, #90 +; CHECK-NEXT: fcadd v0.4s, v3.4s, v4.4s, #90 ; CHECK-NEXT: ret entry: %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> From d6e1909526fea196e20566d7d66c1b6ca04fa9e5 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Tue, 30 May 2023 09:21:16 -0700 Subject: [PATCH 099/704] Mark header as textual --- clang/include/module.modulemap | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index 9fea3620100f4..6ea613c70306f 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -55,6 +55,7 @@ module Clang_Basic { textual header "clang/Basic/BuiltinsPPC.def" textual header "clang/Basic/BuiltinsRISCV.def" textual header "clang/Basic/BuiltinsRISCVVector.def" + textual header "clang/Basic/BuiltinsSME.def" textual 
header "clang/Basic/BuiltinsSVE.def" textual header "clang/Basic/BuiltinsSystemZ.def" textual header "clang/Basic/BuiltinsVE.def" From 0ec79f413e3a292063ca047b520b5b9b592cdc0c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 30 May 2023 17:21:44 +0100 Subject: [PATCH 100/704] [X86] Regenerate sqrt-fastmath-mir.ll --- llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll | 188 +++++++++++---------- 1 file changed, 97 insertions(+), 91 deletions(-) diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll index d3715f2eac164..8a7fea78702d8 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -6,12 +6,13 @@ declare float @llvm.sqrt.f32(float) #2 define float @sqrt_ieee(float %f) #0 { ; CHECK-LABEL: name: sqrt_ieee ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: %1:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr - ; CHECK: $xmm0 = COPY %1 - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]] + ; CHECK-NEXT: RET 0, $xmm0 %call = tail call float @llvm.sqrt.f32(float %f) ret float %call } @@ -19,31 +20,32 @@ define float @sqrt_ieee(float %f) #0 { define float @sqrt_ieee_ninf(float %f) #0 { ; CHECK-LABEL: name: sqrt_ieee_ninf ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK: %5:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK: %7:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = ninf afn nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = ninf afn nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: %11:fr32 = ninf afn nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %12:fr32 = ninf afn nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr - ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %12 - ; CHECK: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]] - ; CHECK: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load (s32) from constant-pool) - ; CHECK: [[VPANDrr:%[0-9]+]]:vr128 = VPANDrr killed [[COPY2]], killed [[VPBROADCASTDrm]] - ; CHECK: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDrr]] - ; CHECK: %18:fr32 = nofpexcept VCMPSSrm killed [[COPY3]], $rip, 1, $noreg, %const.3, $noreg, 1, implicit $mxcsr :: (load (s32) from constant-pool) - ; CHECK: [[COPY4:%[0-9]+]]:vr128 = COPY %18 - ; CHECK: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY4]], killed [[COPY1]] - ; CHECK: [[COPY5:%[0-9]+]]:fr32 = COPY 
[[VPANDNrr]] - ; CHECK: $xmm0 = COPY [[COPY5]] - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY [[VMULSSrr5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]] + ; CHECK-NEXT: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VPANDrr:%[0-9]+]]:vr128 = VPANDrr killed [[COPY2]], killed [[VPBROADCASTDrm]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDrr]] + ; CHECK-NEXT: [[VCMPSSrm:%[0-9]+]]:fr32 = nofpexcept VCMPSSrm killed [[COPY3]], $rip, 1, $noreg, %const.3, $noreg, 1, implicit $mxcsr :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vr128 = COPY [[VCMPSSrm]] + ; CHECK-NEXT: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY4]], killed [[COPY1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] + ; CHECK-NEXT: $xmm0 = COPY [[COPY5]] + ; CHECK-NEXT: RET 0, $xmm0 %call = tail call ninf afn float @llvm.sqrt.f32(float %f) ret float %call } @@ -51,12 +53,13 @@ define float @sqrt_ieee_ninf(float %f) #0 { define float @sqrt_daz(float %f) #1 { ; CHECK-LABEL: name: sqrt_daz ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: %1:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr - ; CHECK: $xmm0 = COPY %1 - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]] + ; CHECK-NEXT: RET 0, $xmm0 %call = tail call float @llvm.sqrt.f32(float %f) ret float %call } @@ -64,28 +67,29 @@ define float @sqrt_daz(float %f) #1 { define float @sqrt_daz_ninf(float %f) #1 { ; CHECK-LABEL: name: sqrt_daz_ninf ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - 
; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK: %5:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK: %7:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = ninf afn nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = ninf afn nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: %11:fr32 = ninf afn nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %12:fr32 = ninf afn nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr - ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %12 - ; CHECK: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS - ; CHECK: %15:fr32 = nofpexcept VCMPSSrr [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr - ; CHECK: [[COPY2:%[0-9]+]]:vr128 = COPY %15 - ; CHECK: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY2]], killed [[COPY1]] - ; CHECK: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] - ; CHECK: $xmm0 = COPY [[COPY3]] - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY [[VMULSSrr5]] + ; CHECK-NEXT: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS + ; CHECK-NEXT: [[VCMPSSrr:%[0-9]+]]:fr32 = nofpexcept VCMPSSrr [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[VCMPSSrr]] + ; CHECK-NEXT: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY2]], killed [[COPY1]] + ; CHECK-NEXT: 
[[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] + ; CHECK-NEXT: $xmm0 = COPY [[COPY3]] + ; CHECK-NEXT: RET 0, $xmm0 %call = tail call ninf afn float @llvm.sqrt.f32(float %f) ret float %call } @@ -93,22 +97,23 @@ define float @sqrt_daz_ninf(float %f) #1 { define float @rsqrt_ieee(float %f) #0 { ; CHECK-LABEL: name: rsqrt_ieee ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK: %5:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK: %7:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr %8, killed %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: %11:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr %8, [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %12:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr - ; CHECK: $xmm0 = COPY %12 - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VMULSSrr2]], killed [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VMULSSrr2]], [[VMOVSSrm_alt1]], 
implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VMULSSrr5]] + ; CHECK-NEXT: RET 0, $xmm0 %sqrt = tail call float @llvm.sqrt.f32(float %f) %div = fdiv fast float 1.0, %sqrt ret float %div @@ -117,22 +122,23 @@ define float @rsqrt_ieee(float %f) #0 { define float @rsqrt_daz(float %f) #1 { ; CHECK-LABEL: name: rsqrt_daz ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK: %5:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK: %7:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr %8, killed %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: %11:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr %8, [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %12:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr - ; CHECK: $xmm0 = COPY %12 - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VMULSSrr2]], killed [[VMULSSrr3]], 
[[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VMULSSrr2]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VMULSSrr5]] + ; CHECK-NEXT: RET 0, $xmm0 %sqrt = tail call float @llvm.sqrt.f32(float %f) %div = fdiv fast float 1.0, %sqrt ret float %div From 0989ce947e3dd64f3e29e7c34f186c0a23c0323d Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Tue, 30 May 2023 18:17:10 +0200 Subject: [PATCH 101/704] [clang][analyzer][NFC] Move dyn_cast's into if statements for readability Reviewed By: steakhal Differential Revision: https://reviews.llvm.org/D151725 --- clang/lib/StaticAnalyzer/Core/MemRegion.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp index 0c126a632f745..bb45a879471ae 100644 --- a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp +++ b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp @@ -712,21 +712,17 @@ std::string MemRegion::getDescriptiveName(bool UseQuotes) const { } SourceRange MemRegion::sourceRange() const { - const auto *const VR = dyn_cast(this->getBaseRegion()); - const auto *const FR = dyn_cast(this); - // Check for more specific regions first. - // FieldRegion - if (FR) { + if (auto *FR = dyn_cast(this)) { return FR->getDecl()->getSourceRange(); } - // VarRegion - else if (VR) { + + if (auto *VR = dyn_cast(this->getBaseRegion())) { return VR->getDecl()->getSourceRange(); } + // Return invalid source range (can be checked by client). - else - return {}; + return {}; } //===----------------------------------------------------------------------===// From 8a40f89e2e9357539db0f9a119db0fcfc77232ab Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Tue, 30 May 2023 18:23:11 +0200 Subject: [PATCH 102/704] [clang][analyzer][NFC] Replace dyn_cast with cast in MemRegion::getMemorySpace MemRegion::getMemorySpace() is annotated with LLVM_ATTRIBUTE_RETURNS_NONNULL (which triggers instant UB if a null pointer is returned), and callers indeed don't check the return value for null. Thus, even though llvm::dyn_cast is called, it can never return null in this context. Therefore, we can safely call llvm::cast. Reviewed By: steakhal Differential Revision: https://reviews.llvm.org/D151727 --- clang/lib/StaticAnalyzer/Core/MemRegion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp index bb45a879471ae..d7409ae6aebe8 100644 --- a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp +++ b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp @@ -1279,7 +1279,7 @@ const MemSpaceRegion *MemRegion::getMemorySpace() const { SR = dyn_cast(R); } - return dyn_cast(R); + return cast(R); } bool MemRegion::hasStackStorage() const { From d951c6a533630b808d4cd733d4362f8e21661a82 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Thu, 25 May 2023 21:38:07 +0200 Subject: [PATCH 103/704] [libc++][CI] Installs libomp. This is needed to build clang-tidy plugins using clang-tidy 17. 
Reviewed By: #libc, ldionne Differential Revision: https://reviews.llvm.org/D151488 --- libcxx/utils/ci/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile index acceee3e74c97..6e9bbfb06bed3 100644 --- a/libcxx/utils/ci/Dockerfile +++ b/libcxx/utils/ci/Dockerfile @@ -93,7 +93,8 @@ RUN apt-get update && apt-get install -y clang-tidy-$(($LLVM_HEAD_VERSION - 2)) # TODO(LLVM-17) revert D148831 to only install $(($LLVM_HEAD_VERSION - 1)) and $LLVM_HEAD_VERSION # The usage of the ToT version is needed due to module issues with Clang 16 RUN apt-get update && apt-get install -y llvm-$(($LLVM_HEAD_VERSION - 2))-dev llvm-$(($LLVM_HEAD_VERSION - 1))-dev llvm-$LLVM_HEAD_VERSION-dev \ - libclang-$(($LLVM_HEAD_VERSION - 2))-dev libclang-$(($LLVM_HEAD_VERSION - 1))-dev libclang-$LLVM_HEAD_VERSION-dev + libclang-$(($LLVM_HEAD_VERSION - 2))-dev libclang-$(($LLVM_HEAD_VERSION - 1))-dev libclang-$LLVM_HEAD_VERSION-dev \ + libomp5-$LLVM_HEAD_VERSION # Install the most recent GCC, like clang install the previous version as a transition. ENV GCC_LATEST_VERSION=13 From 5e98dbff729877344736ffa083bfc57e1b787bd1 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 30 May 2023 17:45:16 +0100 Subject: [PATCH 104/704] [AArch64] Add i1 insert/extract cost tests. NFC See D151189. The existing files check lines have also been adjusted whilst here. --- .../CostModel/AArch64/insert-extract.ll | 124 ++++++++++-------- .../CostModel/AArch64/sve-insert-extract.ll | 20 +++ 2 files changed, 89 insertions(+), 55 deletions(-) diff --git a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll index 870b6631af242..ebffcec340b3c 100644 --- a/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll +++ b/llvm/test/Analysis/CostModel/AArch64/insert-extract.ll @@ -1,15 +1,53 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -passes="print" 2>&1 -disable-output | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mcpu=neoverse-n1 | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mcpu=neoverse-n2 | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mcpu=neoverse-v1 | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mcpu=neoverse-v2 | FileCheck %s ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mcpu=kryo | FileCheck %s --check-prefix=KRYO -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mcpu=neoverse-n1 | FileCheck %s --check-prefix=NEO -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mcpu=neoverse-n2 | FileCheck %s --check-prefix=NEO -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mcpu=neoverse-v1 | FileCheck %s --check-prefix=NEO -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mcpu=neoverse-v2 | FileCheck %s --check-prefix=NEO target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" define void @vectorInstrCost() { +; CHECK-LABEL: 'vectorInstrCost' +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ta0 = extractelement <8 x i1> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ta1 = extractelement <8 x i1> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t1 = extractelement <8 x i8> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t2 = extractelement <8 x i8> undef, i32 1 +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t3 = extractelement <4 x i16> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t4 = extractelement <4 x i16> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t5 = extractelement <2 x i32> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t6 = extractelement <2 x i32> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t7 = extractelement <2 x i64> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t8 = extractelement <2 x i64> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t9 = extractelement <4 x half> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t10 = extractelement <4 x half> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t11 = extractelement <2 x float> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t12 = extractelement <2 x float> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t13 = extractelement <2 x double> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t14 = extractelement <2 x double> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t31 = insertelement <8 x i1> undef, i1 false, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t41 = insertelement <8 x i1> undef, i1 true, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t30 = insertelement <8 x i8> undef, i8 0, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t40 = insertelement <8 x i8> undef, i8 1, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t50 = insertelement <4 x i16> undef, i16 2, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t60 = insertelement <4 x i16> undef, i16 3, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t70 = insertelement <2 x i32> undef, i32 4, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t80 = insertelement <2 x i32> undef, i32 5, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t90 = insertelement <2 x i64> undef, i64 6, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t100 = insertelement <2 x i64> undef, i64 7, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t110 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t130 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t150 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1 +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; ; KRYO-LABEL: 'vectorInstrCost' +; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ta0 = extractelement <8 x i1> undef, i32 0 +; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ta1 = extractelement <8 x i1> undef, i32 1 ; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t1 = extractelement <8 x i8> undef, i32 0 ; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t2 = extractelement <8 x i8> undef, i32 1 ; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t3 = extractelement <4 x i16> undef, i32 0 @@ -24,6 +62,8 @@ define void @vectorInstrCost() { ; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = extractelement <2 x float> undef, i32 1 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t13 = extractelement <2 x double> undef, i32 0 ; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = extractelement <2 x double> undef, i32 1 +; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t31 = insertelement <8 x i1> undef, i1 false, i32 0 +; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t41 = insertelement <8 x i1> undef, i1 true, i32 1 ; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t30 = insertelement <8 x i8> undef, i8 0, i32 0 ; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t40 = insertelement <8 x i8> undef, i8 1, i32 1 ; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = insertelement <4 x i16> undef, i16 2, i32 0 @@ -40,37 +80,9 @@ define void @vectorInstrCost() { ; KRYO-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; -; NEO-LABEL: 'vectorInstrCost' -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t1 = extractelement <8 x i8> undef, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t2 = extractelement <8 x i8> undef, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t3 = extractelement <4 x i16> undef, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t4 = extractelement <4 x i16> undef, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t5 = extractelement <2 x i32> undef, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t6 = extractelement <2 x i32> undef, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t7 = extractelement <2 x i64> undef, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t8 = extractelement <2 x i64> undef, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t9 = extractelement <4 x half> undef, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t10 = extractelement <4 x half> undef, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t11 = extractelement <2 x float> undef, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t12 = extractelement <2 x float> undef, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t13 = extractelement <2 x double> undef, i32 0 -; NEO-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %t14 = extractelement <2 x double> undef, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t30 = insertelement <8 x i8> undef, i8 0, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t40 = insertelement <8 x i8> undef, i8 1, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t50 = insertelement <4 x i16> undef, i16 2, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t60 = insertelement <4 x i16> undef, i16 3, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t70 = insertelement <2 x i32> undef, i32 4, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t80 = insertelement <2 x i32> undef, i32 5, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t90 = insertelement <2 x i64> undef, i64 6, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t100 = insertelement <2 x i64> undef, i64 7, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t110 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t130 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t150 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; + + %ta0 = extractelement <8 x i1> undef, i32 0 + %ta1 = extractelement <8 x i1> undef, i32 1 %t1 = extractelement <8 x i8> undef, i32 0 %t2 = extractelement <8 x i8> undef, i32 1 %t3 = extractelement <4 x i16> undef, i32 0 @@ -86,6 +98,8 @@ define void @vectorInstrCost() { %t13 = extractelement <2 x double> undef, i32 0 %t14 = extractelement <2 x double> undef, i32 1 + %t31 = insertelement <8 x i1> undef, i1 0, i32 0 + %t41 = insertelement <8 x i1> undef, i1 1, i32 1 %t30 = insertelement <8 x i8> undef, i8 0, i32 0 %t40 = insertelement <8 x i8> undef, i8 1, i32 1 %t50 = insertelement <4 x i16> undef, i16 2, i32 0 @@ -106,16 +120,16 @@ define void @vectorInstrCost() { ;; LD1: Load one single-element structure to one lane of one register. 
define <8 x i8> @LD1_B(<8 x i8> %vec, ptr noundef %i) { +; CHECK-LABEL: 'LD1_B' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2 +; ; KRYO-LABEL: 'LD1_B' ; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1 ; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2 ; -; NEO-LABEL: 'LD1_B' -; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2 -; entry: %v1 = load i8, ptr %i, align 1 %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 @@ -123,16 +137,16 @@ entry: } define <4 x i16> @LD1_H(<4 x i16> %vec, ptr noundef %i) { +; CHECK-LABEL: 'LD1_H' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2 +; ; KRYO-LABEL: 'LD1_H' ; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2 ; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2 ; -; NEO-LABEL: 'LD1_H' -; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2 -; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2 -; entry: %v1 = load i16, ptr %i, align 2 %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 @@ -140,16 +154,16 @@ entry: } define <4 x i32> @LD1_W(<4 x i32> %vec, ptr noundef %i) { +; CHECK-LABEL: 'LD1_W' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2 +; ; KRYO-LABEL: 'LD1_W' ; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, align 4 ; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2 ; -; NEO-LABEL: 'LD1_W' -; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, align 4 -; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2 -; entry: %v1 = load i32, ptr %i, align 4 %v2 = insertelement <4 x i32> %vec, i32 %v1, 
i32 3 @@ -157,16 +171,16 @@ entry: } define <2 x i64> @LD1_X(<2 x i64> %vec, ptr noundef %i) { +; CHECK-LABEL: 'LD1_X' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2 +; ; KRYO-LABEL: 'LD1_X' ; KRYO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8 ; KRYO-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 ; KRYO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2 ; -; NEO-LABEL: 'LD1_X' -; NEO-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8 -; NEO-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 -; NEO-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2 -; entry: %v1 = load i64, ptr %i, align 8 %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll index ad79609b94660..f6910cfd6a5e5 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll @@ -9,6 +9,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @ins_el0() #0 { ; CHECK-DEFAULT-LABEL: 'ins_el0' +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 0 @@ -18,6 +19,7 @@ define void @ins_el0() #0 { ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-LOW-LABEL: 'ins_el0' +; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 0 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 0 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 0 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 0 @@ -27,6 +29,7 @@ define void @ins_el0() #0 { ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-HIGH-LABEL: 'ins_el0' +; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 0 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 0 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 0 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = insertelement zeroinitializer, i32 0, 
i64 0 @@ -35,6 +38,7 @@ define void @ins_el0() #0 { ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 0 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %vi1 = insertelement zeroinitializer, i1 0, i64 0 %v0 = insertelement zeroinitializer, i8 0, i64 0 %v1 = insertelement zeroinitializer, i16 0, i64 0 %v2 = insertelement zeroinitializer, i32 0, i64 0 @@ -46,6 +50,7 @@ define void @ins_el0() #0 { define void @ins_el1() #0 { ; CHECK-DEFAULT-LABEL: 'ins_el1' +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 1 @@ -55,6 +60,7 @@ define void @ins_el1() #0 { ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-LOW-LABEL: 'ins_el1' +; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 1 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 1 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 1 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 1 @@ -64,6 +70,7 @@ define void @ins_el1() #0 { ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-HIGH-LABEL: 'ins_el1' +; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 1 @@ -72,6 +79,7 @@ define void @ins_el1() #0 { ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %vi1 = insertelement zeroinitializer, i1 0, i64 1 %v0 = insertelement zeroinitializer, i8 0, i64 1 %v1 = insertelement zeroinitializer, i16 0, i64 1 %v2 = insertelement zeroinitializer, i32 0, i64 1 @@ -84,6 +92,7 @@ define void @ins_el1() #0 { define void @ext_el0() #0 { ; CHECK-DEFAULT-LABEL: 'ext_el0' +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement zeroinitializer, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement zeroinitializer, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement zeroinitializer, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement 
zeroinitializer, i64 0 @@ -93,6 +102,7 @@ define void @ext_el0() #0 { ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-LOW-LABEL: 'ext_el0' +; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vi1 = extractelement zeroinitializer, i64 0 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement zeroinitializer, i64 0 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement zeroinitializer, i64 0 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement zeroinitializer, i64 0 @@ -102,6 +112,7 @@ define void @ext_el0() #0 { ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-HIGH-LABEL: 'ext_el0' +; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %vi1 = extractelement zeroinitializer, i64 0 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement zeroinitializer, i64 0 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = extractelement zeroinitializer, i64 0 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement zeroinitializer, i64 0 @@ -110,6 +121,7 @@ define void @ext_el0() #0 { ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement zeroinitializer, i64 0 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %vi1 = extractelement zeroinitializer, i64 0 %v0 = extractelement zeroinitializer, i64 0 %v1 = extractelement zeroinitializer, i64 0 %v2 = extractelement zeroinitializer, i64 0 @@ -121,6 +133,7 @@ define void @ext_el0() #0 { define void @ext_el1() #0 { ; CHECK-DEFAULT-LABEL: 'ext_el1' +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement zeroinitializer, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement zeroinitializer, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement zeroinitializer, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement zeroinitializer, i64 1 @@ -130,6 +143,7 @@ define void @ext_el1() #0 { ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-LOW-LABEL: 'ext_el1' +; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vi1 = extractelement zeroinitializer, i64 1 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement zeroinitializer, i64 1 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement zeroinitializer, i64 1 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement zeroinitializer, i64 1 @@ -139,6 +153,7 @@ define void @ext_el1() #0 { ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-HIGH-LABEL: 'ext_el1' +; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %vi1 = extractelement zeroinitializer, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement zeroinitializer, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = 
extractelement zeroinitializer, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement zeroinitializer, i64 1 @@ -147,6 +162,7 @@ define void @ext_el1() #0 { ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v5 = extractelement zeroinitializer, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %vi1 = extractelement zeroinitializer, i64 1 %v0 = extractelement zeroinitializer, i64 1 %v1 = extractelement zeroinitializer, i64 1 %v2 = extractelement zeroinitializer, i64 1 @@ -160,6 +176,7 @@ define void @ext_el1() #0 { ; Test the behaviour in the presence of a CPU-specific override in AArch64Subtarget (via attribute set). define void @test_override_cpu_given() #1 { ; CHECK-DEFAULT-LABEL: 'test_override_cpu_given' +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vi1 = extractelement zeroinitializer, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement zeroinitializer, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement zeroinitializer, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement zeroinitializer, i64 1 @@ -169,6 +186,7 @@ define void @test_override_cpu_given() #1 { ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-LOW-LABEL: 'test_override_cpu_given' +; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %vi1 = extractelement zeroinitializer, i64 1 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement zeroinitializer, i64 1 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement zeroinitializer, i64 1 ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement zeroinitializer, i64 1 @@ -178,6 +196,7 @@ define void @test_override_cpu_given() #1 { ; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-HIGH-LABEL: 'test_override_cpu_given' +; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %vi1 = extractelement zeroinitializer, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement zeroinitializer, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = extractelement zeroinitializer, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement zeroinitializer, i64 1 @@ -186,6 +205,7 @@ define void @test_override_cpu_given() #1 { ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v5 = extractelement zeroinitializer, i64 1 ; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + %vi1 = extractelement zeroinitializer, i64 1 %v0 = extractelement zeroinitializer, i64 1 %v1 = extractelement zeroinitializer, i64 1 %v2 = extractelement zeroinitializer, i64 1 From 93f8554e65449903a4de4bae4b2f6a09b4129fb6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 30 May 2023 09:51:49 -0700 Subject: [PATCH 105/704] [RISCV] Correct capitalization of SiFive in a comment. 
NFC --- clang/include/clang/Sema/Sema.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 7447755ef2df2..012a3aa93fcdc 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1624,7 +1624,7 @@ class Sema final { /// Indicate RISC-V vector builtin functions enabled or not. bool DeclareRISCVVBuiltins = false; - /// Indicate RISC-V Sifive vector builtin functions enabled or not. + /// Indicate RISC-V SiFive vector builtin functions enabled or not. bool DeclareRISCVVectorBuiltins = false; private: From 8c5ad4a0e2970e2954f262d487f27737491f8f5c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 May 2023 18:52:16 +0200 Subject: [PATCH 106/704] Fix "[compiler-rt] Refactor memintrinsic interceptors" Fix the Fuchsia build. asan_interceptors_memintrinsics.cpp should not include any interceptors on Fuchsia. Reported-by: haowei Link: https://logs.chromium.org/logs/fuchsia/buildbucket/cr-buildbucket/8779679021892159153/+/u/clang/build/stdout --- compiler-rt/lib/asan/asan_interceptors.h | 4 +- .../asan/asan_interceptors_memintrinsics.cpp | 42 ++++++++++--------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/compiler-rt/lib/asan/asan_interceptors.h b/compiler-rt/lib/asan/asan_interceptors.h index 087189dc1f4d8..268096fea5e7e 100644 --- a/compiler-rt/lib/asan/asan_interceptors.h +++ b/compiler-rt/lib/asan/asan_interceptors.h @@ -166,12 +166,12 @@ DECLARE_REAL(char*, strstr, const char *s1, const char *s2) # define ASAN_INTERCEPT_FUNC(name) # endif // SANITIZER_APPLE -#endif // !SANITIZER_FUCHSIA - #define ASAN_INTERCEPTOR_ENTER(ctx, func) \ AsanInterceptorContext _ctx = {#func}; \ ctx = (void *)&_ctx; \ (void) ctx; #define COMMON_INTERCEPT_FUNCTION(name) ASAN_INTERCEPT_FUNC(name) +#endif // !SANITIZER_FUCHSIA + #endif // ASAN_INTERCEPTORS_H diff --git a/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp b/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp index 83bb9fbcad4fd..9d1452482d3af 100644 --- a/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp +++ b/compiler-rt/lib/asan/asan_interceptors_memintrinsics.cpp @@ -57,26 +57,6 @@ using namespace __asan; return internal_memmove(to, from, size); \ } while (0) -#define COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size) \ - do { \ - ASAN_INTERCEPTOR_ENTER(ctx, memmove); \ - ASAN_MEMMOVE_IMPL(ctx, to, from, size); \ - } while (false) - -#define COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size) \ - do { \ - ASAN_INTERCEPTOR_ENTER(ctx, memcpy); \ - ASAN_MEMCPY_IMPL(ctx, to, from, size); \ - } while (false) - -#define COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size) \ - do { \ - ASAN_INTERCEPTOR_ENTER(ctx, memset); \ - ASAN_MEMSET_IMPL(ctx, block, c, size); \ - } while (false) - -#include "sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc" - void *__asan_memcpy(void *to, const void *from, uptr size) { ASAN_MEMCPY_IMPL(nullptr, to, from, size); } @@ -99,4 +79,26 @@ extern "C" decltype(__asan_memcpy) memcpy[[gnu::alias("__asan_memcpy")]]; extern "C" decltype(__asan_memmove) memmove[[gnu::alias("__asan_memmove")]]; extern "C" decltype(__asan_memset) memset[[gnu::alias("__asan_memset")]]; +#else // SANITIZER_FUCHSIA + +#define COMMON_INTERCEPTOR_MEMMOVE_IMPL(ctx, to, from, size) \ + do { \ + ASAN_INTERCEPTOR_ENTER(ctx, memmove); \ + ASAN_MEMMOVE_IMPL(ctx, to, from, size); \ + } while (false) + +#define COMMON_INTERCEPTOR_MEMCPY_IMPL(ctx, to, from, size) \ + do { \ + 
ASAN_INTERCEPTOR_ENTER(ctx, memcpy); \ + ASAN_MEMCPY_IMPL(ctx, to, from, size); \ + } while (false) + +#define COMMON_INTERCEPTOR_MEMSET_IMPL(ctx, block, c, size) \ + do { \ + ASAN_INTERCEPTOR_ENTER(ctx, memset); \ + ASAN_MEMSET_IMPL(ctx, block, c, size); \ + } while (false) + +#include "sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc" + #endif // SANITIZER_FUCHSIA From 66c7388c83bb0320f2223da3e3f31363f44bb3aa Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Thu, 25 May 2023 20:02:50 +0200 Subject: [PATCH 107/704] [libc++] Deprecate the classes in strstream. These have been deprecated since their initial version in libc++. It seems they were never properly marked as deprecated. Discovered while working on D151223. Reviewed By: #libc, ldionne, philnik Differential Revision: https://reviews.llvm.org/D151474 --- libcxx/docs/ReleaseNotes.rst | 3 +++ libcxx/include/strstream | 8 ++++---- .../std/depr/depr.ios.members/io_state.pass.cpp | 6 ++++-- .../std/depr/depr.ios.members/open_mode.pass.cpp | 6 ++++-- .../std/depr/depr.ios.members/seek_dir.pass.cpp | 6 ++++-- .../depr.istrstream.cons/ccp.pass.cpp | 2 ++ .../depr.istrstream.cons/ccp_size.pass.cpp | 2 ++ .../depr.istrstream.cons/cp.pass.cpp | 2 ++ .../depr.istrstream.cons/cp_size.pass.cpp | 2 ++ .../depr.istrstream.members/rdbuf.pass.cpp | 2 ++ .../depr.istrstream.members/str.pass.cpp | 2 ++ .../depr.istrstream/depr.verify.cpp | 15 +++++++++++++++ .../depr.istrstream/types.pass.cpp | 2 ++ .../depr.ostrstream.cons/cp_size_mode.pass.cpp | 2 ++ .../depr.ostrstream.cons/default.pass.cpp | 2 ++ .../depr.ostrstream.members/freeze.pass.cpp | 2 ++ .../depr.ostrstream.members/pcount.pass.cpp | 2 ++ .../depr.ostrstream.members/rdbuf.pass.cpp | 2 ++ .../depr.ostrstream.members/str.pass.cpp | 2 ++ .../depr.ostrstream/depr.verify.cpp | 15 +++++++++++++++ .../depr.ostrstream/types.pass.cpp | 2 ++ .../depr.strstream.cons/cp_size_mode.pass.cpp | 2 ++ .../depr.strstream.cons/default.pass.cpp | 2 ++ .../depr.strstream.dest/rdbuf.pass.cpp | 2 ++ .../depr.strstream.oper/freeze.pass.cpp | 2 ++ .../depr.strstream.oper/pcount.pass.cpp | 2 ++ .../depr.strstream.oper/str.pass.cpp | 2 ++ .../depr.strstream/depr.verify.cpp | 15 +++++++++++++++ .../depr.strstream/types.pass.cpp | 2 ++ .../depr.strstreambuf.cons/ccp_size.pass.cpp | 2 ++ .../depr.strstreambuf.cons/cp_size_cp.pass.cpp | 2 ++ .../depr.strstreambuf.cons/cscp_size.pass.cpp | 2 ++ .../depr.strstreambuf.cons/cucp_size.pass.cpp | 2 ++ .../depr.strstreambuf.cons/custom_alloc.pass.cpp | 2 ++ .../depr.strstreambuf.cons/default.pass.cpp | 2 ++ .../depr.strstreambuf.cons/scp_size_scp.pass.cpp | 2 ++ .../depr.strstreambuf.cons/ucp_size_ucp.pass.cpp | 2 ++ .../depr.strstreambuf.members/freeze.pass.cpp | 2 ++ .../depr.strstreambuf.members/overflow.pass.cpp | 2 ++ .../depr.strstreambuf.members/pcount.pass.cpp | 2 ++ .../depr.strstreambuf.members/str.pass.cpp | 2 ++ .../depr.strstreambuf.virtuals/overflow.pass.cpp | 2 ++ .../depr.strstreambuf.virtuals/pbackfail.pass.cpp | 2 ++ .../depr.strstreambuf.virtuals/seekoff.pass.cpp | 2 ++ .../depr.strstreambuf.virtuals/seekpos.pass.cpp | 2 ++ .../depr.strstreambuf.virtuals/setbuf.pass.cpp | 2 ++ .../depr.strstreambuf.virtuals/underflow.pass.cpp | 2 ++ .../depr.strstreambuf/depr.verify.cpp | 15 +++++++++++++++ .../depr.strstreambuf/types.pass.cpp | 2 ++ 49 files changed, 159 insertions(+), 10 deletions(-) create mode 100644 libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp create mode 100644 
libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp create mode 100644 libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp create mode 100644 libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst index 56e856008cb38..2f040ed1d8d3a 100644 --- a/libcxx/docs/ReleaseNotes.rst +++ b/libcxx/docs/ReleaseNotes.rst @@ -105,6 +105,9 @@ Deprecations and Removals - The ``_LIBCPP_ABI_OLD_LOGNORMAL_DISTRIBUTION`` macro has been removed. +- The classes ``strstreambuf`` , ``istrstream``, ``ostrstream``, and ``strstream`` have been deprecated. + They have been deprecated in the Standard since C++98, but were never marked as deprecated in libc++. + Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/include/strstream b/libcxx/include/strstream index d0688fc4da19e..01590445fea68 100644 --- a/libcxx/include/strstream +++ b/libcxx/include/strstream @@ -141,7 +141,7 @@ private: _LIBCPP_BEGIN_NAMESPACE_STD -class _LIBCPP_TYPE_VIS strstreambuf +class _LIBCPP_DEPRECATED _LIBCPP_TYPE_VIS strstreambuf : public streambuf { public: @@ -237,7 +237,7 @@ strstreambuf::operator=(strstreambuf&& __rhs) #endif // _LIBCPP_CXX03_LANG -class _LIBCPP_TYPE_VIS istrstream +class _LIBCPP_DEPRECATED _LIBCPP_TYPE_VIS istrstream : public istream { public: @@ -290,7 +290,7 @@ private: strstreambuf __sb_; }; -class _LIBCPP_TYPE_VIS ostrstream +class _LIBCPP_DEPRECATED _LIBCPP_TYPE_VIS ostrstream : public ostream { public: @@ -343,7 +343,7 @@ private: strstreambuf __sb_; // exposition only }; -class _LIBCPP_TYPE_VIS strstream +class _LIBCPP_DEPRECATED _LIBCPP_TYPE_VIS strstream : public iostream { public: diff --git a/libcxx/test/std/depr/depr.ios.members/io_state.pass.cpp b/libcxx/test/std/depr/depr.ios.members/io_state.pass.cpp index 37b23fc5e8c34..9ed8476835793 100644 --- a/libcxx/test/std/depr/depr.ios.members/io_state.pass.cpp +++ b/libcxx/test/std/depr/depr.ios.members/io_state.pass.cpp @@ -6,6 +6,10 @@ // //===----------------------------------------------------------------------===// +// REQUIRES: c++03 || c++11 || c++14 + +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // // class ios_base @@ -22,10 +26,8 @@ int main(int, char**) { -#if TEST_STD_VER <= 14 std::strstream::io_state b = std::strstream::eofbit; assert(b == std::ios::eofbit); -#endif return 0; } diff --git a/libcxx/test/std/depr/depr.ios.members/open_mode.pass.cpp b/libcxx/test/std/depr/depr.ios.members/open_mode.pass.cpp index 57a88c957673a..de536c74c8dc0 100644 --- a/libcxx/test/std/depr/depr.ios.members/open_mode.pass.cpp +++ b/libcxx/test/std/depr/depr.ios.members/open_mode.pass.cpp @@ -6,6 +6,10 @@ // //===----------------------------------------------------------------------===// +// REQUIRES: c++03 || c++11 || c++14 + +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // // class ios_base @@ -22,10 +26,8 @@ int main(int, char**) { -#if TEST_STD_VER <= 14 std::strstream::open_mode b = std::strstream::app; assert(b == std::ios::app); -#endif return 0; } diff --git a/libcxx/test/std/depr/depr.ios.members/seek_dir.pass.cpp b/libcxx/test/std/depr/depr.ios.members/seek_dir.pass.cpp index 5b48073bd2f11..b68e53694ddca 100644 --- a/libcxx/test/std/depr/depr.ios.members/seek_dir.pass.cpp +++ b/libcxx/test/std/depr/depr.ios.members/seek_dir.pass.cpp @@ -6,6 +6,10 @@ // //===----------------------------------------------------------------------===// +// 
REQUIRES: c++03 || c++11 || c++14 + +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // // class ios_base @@ -22,10 +26,8 @@ int main(int, char**) { -#if TEST_STD_VER <= 14 std::strstream::seek_dir b = std::strstream::cur; assert(b == std::ios::cur); -#endif return 0; } diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp index 9823361b509e5..b5ee0bfbecf08 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class istrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp index 0808f45ba52d8..4d0d6731338a8 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/ccp_size.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class istrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp index 681d7b7cf4c0d..58980949732dc 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class istrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp index 96f17b0072790..e13e20e20f8c0 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.cons/cp_size.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class istrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp index 80401264f3020..449114a0f6a91 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/rdbuf.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class istrstream diff --git 
a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp index 0ee341b226fbe..e7c063750fb63 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.istrstream.members/str.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class istrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp new file mode 100644 index 0000000000000..2ab252e934948 --- /dev/null +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/depr.verify.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// check that istrstream is marked deprecated + +#include + +std::istrstream s("abc"); // expected-warning {{'istrstream' is deprecated}} diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp index 93cdbf677ec39..be1a9e1251ba9 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.istrstream/types.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class istrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp index 64a12ab3620a4..8698983a7ebc5 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/cp_size_mode.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class ostrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp index 6960f75c59fc5..abbf6af7e11eb 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.cons/default.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class ostrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp index 
f52ff6c13ce2c..854e68b17249c 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/freeze.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class ostrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp index 080e05000c888..9830aeb6fc8c6 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/pcount.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class ostrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp index df38ea977ed37..f9a859dd8dab6 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/rdbuf.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class ostrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp index 5de41cd72306e..72f665af5851c 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.ostrstream.members/str.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class ostrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp new file mode 100644 index 0000000000000..e0c805fee1077 --- /dev/null +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/depr.verify.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// check that ostrstream is marked deprecated + +#include + +std::ostrstream s; // expected-warning {{'ostrstream' is deprecated}} diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp index 3e753702065bc..6a71c44a90cf6 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.ostrstream/types.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class ostrstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp index 96c2890c2cc07..a85e1328b3514 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/cp_size_mode.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp index 907985b3de40d..390162ef0f180 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.cons/default.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp index 210e73057f087..3fe277a48cbba 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.dest/rdbuf.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp index f5de291a87974..263fddef6c346 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/freeze.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp 
b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp index abf928ffe5aaa..b053cf128ab4c 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/pcount.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp index 59ee9095ec6a1..3d251d9a99cd3 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.strstream.oper/str.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp new file mode 100644 index 0000000000000..03655226a54b2 --- /dev/null +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/depr.verify.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// check that strstream is marked deprecated + +#include + +std::strstream s; // expected-warning {{'strstream' is deprecated}} diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp index 3e9705366428b..fb543841e8f36 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstream/types.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstream diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp index 57969fb374ffa..8f81707973d89 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ccp_size.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp index 4fe716006c191..25a9617dcdaf5 100644 --- 
a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cp_size_cp.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp index 001151c367d78..fc3386ff5575e 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cscp_size.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp index b4efb11c9881f..a74c504fd2785 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/cucp_size.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp index 83e3009ce4f83..756427df48206 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/custom_alloc.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp index 844f31e4a454c..81924c995b151 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/default.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp index afe34072cb91f..b8991a8fc433b 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp +++ 
b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/scp_size_scp.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp index e7b8d6d139cfa..1d3463f3cac17 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.cons/ucp_size_ucp.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp index 54587c5941a3f..93eec8dde3970 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/freeze.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp index a0a27fa7a600c..5b973cff0ce01 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/overflow.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp index 1bf17d94ed02a..b64c9dcb44470 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/pcount.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp index 841334b373f37..d6c8b8e7e11b6 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.members/str.pass.cpp @@ -6,6 +6,8 @@ // 
//===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp index b2267a0aac227..37109c7e942c9 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/overflow.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp index 1f92b2057da8b..698953f7121ae 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/pbackfail.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp index 94b1346ed82f3..d98e6f73f7aa8 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekoff.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp index f7e96ed4020cb..be88f5aecc5cd 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/seekpos.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp index e663862d39073..ce7612bc66a92 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/setbuf.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // 
class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp index 6aa4b434253b0..4fc79b575c7ab 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.strstreambuf.virtuals/underflow.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp new file mode 100644 index 0000000000000..a598acbba8c80 --- /dev/null +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/depr.verify.cpp @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// check that strstreambuf is marked deprecated + +#include + +std::strstreambuf s; // expected-warning {{'strstreambuf' is deprecated}} diff --git a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp index bf86406c96f48..bc312cbbb2254 100644 --- a/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp +++ b/libcxx/test/std/depr/depr.str.strstreams/depr.strstreambuf/types.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // class strstreambuf From 0ee73debf7445a9a34dcdf0215a99a2919d00112 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 27 May 2023 16:52:37 +0200 Subject: [PATCH 108/704] [libc++][format] Fixes year formatter on Windows. Windows' libc, like some other libc implementations do not work as specified for %Y and %y. This uses the fixes used for other libc implementations. The work was part of D150593. Reviewed By: #libc, ldionne Differential Revision: https://reviews.llvm.org/D151612 --- libcxx/include/__chrono/formatter.h | 17 ++++--- .../time.cal.year.nonmembers/ostream.pass.cpp | 44 ++++++++++--------- .../std/time/time.syn/formatter.year.pass.cpp | 24 +++++----- 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index c8d5993be1961..679edf39cbb39 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -270,20 +270,19 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( // // TODO FMT evaluate the comment above. -# if defined(__GLIBC__) || defined(_AIX) +# if defined(__GLIBC__) || defined(_AIX) || defined(_WIN32) case _CharT('y'): // Glibc fails for negative values, AIX for positive values too. 
__sstr << std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "{:02}"), (std::abs(__t.tm_year + 1900)) % 100); break; -# endif // defined(__GLIBC__) || defined(_AIX) +# endif // defined(__GLIBC__) || defined(_AIX) || defined(_WIN32) - case _CharT('Y'): { - int __year = __t.tm_year + 1900; - if (__year < 1000) - __formatter::__format_year(__year, __sstr); - else - __facet.put({__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1)); - } break; + case _CharT('Y'): + // Depending on the platform's libc the range of supported years is + // limited. Intead of of testing all conditions use the internal + // implementation unconditionally. + __formatter::__format_year(__t.tm_year + 1900, __sstr); + break; case _CharT('F'): { int __year = __t.tm_year + 1900; diff --git a/libcxx/test/std/time/time.cal/time.cal.year/time.cal.year.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.year/time.cal.year.nonmembers/ostream.pass.cpp index 6e95849e9faf4..cd565b82bc2a6 100644 --- a/libcxx/test/std/time/time.cal/time.cal.year/time.cal.year.nonmembers/ostream.pass.cpp +++ b/libcxx/test/std/time/time.cal/time.cal.year/time.cal.year.nonmembers/ostream.pass.cpp @@ -9,9 +9,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-localization -// TODO FMT Investigate Windows issues. -// UNSUPPORTED: msvc, target={{.+}}-windows-gnu - // TODO FMT Fix this test using GCC, it currently crashes. // UNSUPPORTED: gcc-12 @@ -35,9 +32,16 @@ #include "make_string.h" #include "platform_support.h" // locale name macros #include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" #define SV(S) MAKE_STRING_VIEW(CharT, S) +#define TEST_EQUAL(OUT, EXPECTED) \ + TEST_REQUIRE(OUT == EXPECTED, \ + TEST_WRITE_CONCATENATED( \ + "\nExpression ", #OUT, "\nExpected output ", EXPECTED, "\nActual output ", OUT, '\n')); + template static std::basic_string stream_c_locale(std::chrono::year year) { std::basic_stringstream sstr; @@ -65,23 +69,23 @@ static std::basic_string stream_ja_JP_locale(std::chrono::year year) { template static void test() { - assert(stream_c_locale(std::chrono::year{-32'768}) == SV("-32768 is not a valid year")); - assert(stream_c_locale(std::chrono::year{-32'767}) == SV("-32767")); - assert(stream_c_locale(std::chrono::year{0}) == SV("0000")); - assert(stream_c_locale(std::chrono::year{1970}) == SV("1970")); - assert(stream_c_locale(std::chrono::year{32'767}) == SV("32767")); - - assert(stream_fr_FR_locale(std::chrono::year{-32'768}) == SV("-32768 is not a valid year")); - assert(stream_fr_FR_locale(std::chrono::year{-32'767}) == SV("-32767")); - assert(stream_fr_FR_locale(std::chrono::year{0}) == SV("0000")); - assert(stream_fr_FR_locale(std::chrono::year{1970}) == SV("1970")); - assert(stream_fr_FR_locale(std::chrono::year{32'767}) == SV("32767")); - - assert(stream_ja_JP_locale(std::chrono::year{-32'768}) == SV("-32768 is not a valid year")); - assert(stream_ja_JP_locale(std::chrono::year{-32'767}) == SV("-32767")); - assert(stream_ja_JP_locale(std::chrono::year{0}) == SV("0000")); - assert(stream_ja_JP_locale(std::chrono::year{1970}) == SV("1970")); - assert(stream_ja_JP_locale(std::chrono::year{32'767}) == SV("32767")); + TEST_EQUAL(stream_c_locale(std::chrono::year{-32'768}), SV("-32768 is not a valid year")); + TEST_EQUAL(stream_c_locale(std::chrono::year{-32'767}), SV("-32767")); + TEST_EQUAL(stream_c_locale(std::chrono::year{0}), SV("0000")); + TEST_EQUAL(stream_c_locale(std::chrono::year{1970}), SV("1970")); + 
TEST_EQUAL(stream_c_locale(std::chrono::year{32'767}), SV("32767")); + + TEST_EQUAL(stream_fr_FR_locale(std::chrono::year{-32'768}), SV("-32768 is not a valid year")); + TEST_EQUAL(stream_fr_FR_locale(std::chrono::year{-32'767}), SV("-32767")); + TEST_EQUAL(stream_fr_FR_locale(std::chrono::year{0}), SV("0000")); + TEST_EQUAL(stream_fr_FR_locale(std::chrono::year{1970}), SV("1970")); + TEST_EQUAL(stream_fr_FR_locale(std::chrono::year{32'767}), SV("32767")); + + TEST_EQUAL(stream_ja_JP_locale(std::chrono::year{-32'768}), SV("-32768 is not a valid year")); + TEST_EQUAL(stream_ja_JP_locale(std::chrono::year{-32'767}), SV("-32767")); + TEST_EQUAL(stream_ja_JP_locale(std::chrono::year{0}), SV("0000")); + TEST_EQUAL(stream_ja_JP_locale(std::chrono::year{1970}), SV("1970")); + TEST_EQUAL(stream_ja_JP_locale(std::chrono::year{32'767}), SV("32767")); } int main(int, char**) { diff --git a/libcxx/test/std/time/time.syn/formatter.year.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year.pass.cpp index c29140c8192c5..33c427932573a 100644 --- a/libcxx/test/std/time/time.syn/formatter.year.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.year.pass.cpp @@ -10,9 +10,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-localization -// TODO FMT Investigate Windows issues. -// UNSUPPORTED: msvc, target={{.+}}-windows-gnu - // TODO FMT Fix this test using GCC, it currently crashes. // UNSUPPORTED: gcc-12 @@ -88,7 +85,7 @@ static void test_valid_values() { // Non localized output using C-locale check(SV("%C='00'\t" -#if defined(__APPLE__) +#if defined(__APPLE__) || defined(_WIN32) "%EC='00'\t" #else "%EC='0'\t" @@ -97,7 +94,7 @@ static void test_valid_values() { "%Ey='00'\t" "%Oy='00'\t" "%Y='0000'\t" -#if defined(__APPLE__) +#if defined(__APPLE__) || defined(_WIN32) "%EY='0000'\t" #elif defined(_AIX) "%EY=''\t" @@ -132,7 +129,7 @@ static void test_valid_values() { // Use the global locale (fr_FR) check(SV("%C='00'\t" -#if defined(__APPLE__) +#if defined(__APPLE__) || defined(_WIN32) "%EC='00'\t" #else "%EC='0'\t" @@ -141,7 +138,7 @@ static void test_valid_values() { "%Ey='00'\t" "%Oy='00'\t" "%Y='0000'\t" -#if defined(__APPLE__) +#if defined(__APPLE__) || defined(_WIN32) "%EY='0000'\t" #elif defined(_AIX) "%EY=''\t" @@ -175,10 +172,10 @@ static void test_valid_values() { std::chrono::year{2038}); // Use supplied locale (ja_JP). This locale has a different alternate. 
-#if defined(__APPLE__) || defined(_AIX) +#if defined(__APPLE__) || defined(_AIX) || defined(_WIN32) check(SV("%C='00'\t" -# if defined(__APPLE__) +# if defined(__APPLE__) || defined(_WIN32) "%EC='00'\t" # else "%EC='0'\t" @@ -218,12 +215,12 @@ static void test_valid_values() { lfmt, std::chrono::year{2038}); -#else // defined(__APPLE__) || defined(_AIX) +#else // defined(__APPLE__) || defined(_AIX) || defined(_WIN32) check(loc, SV("%C='00'\t" "%EC='紀元前'\t" "%y='00'\t" -// https://sourceware.org/bugzilla/show_bug.cgi?id=23758 + // https://sourceware.org/bugzilla/show_bug.cgi?id=23758 # if defined(__GLIBC__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 29 "%Ey='1'\t" # else @@ -231,7 +228,7 @@ static void test_valid_values() { # endif "%Oy='〇'\t" "%Y='0000'\t" -// https://sourceware.org/bugzilla/show_bug.cgi?id=23758 + // https://sourceware.org/bugzilla/show_bug.cgi?id=23758 # if defined(__GLIBC__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 29 "%EY='紀元前1年'\t" # else @@ -265,7 +262,7 @@ static void test_valid_values() { "\n"), lfmt, std::chrono::year{2038}); -#endif // defined(__APPLE__) || defined(_AIX) +#endif // defined(__APPLE__) || defined(_AIX) || defined(_WIN32) std::locale::global(std::locale::classic()); } @@ -273,7 +270,6 @@ static void test_valid_values() { template static void test_padding() { constexpr std::basic_string_view fmt = SV("{:%%C='%C'%t%%y='%y'%t%%Y='%Y'%t%n}"); - check(SV("%C='-100'\t%y='99'\t%Y='-9999'\t\n"), fmt, std::chrono::year{-9'999}); check(SV("%C='-10'\t%y='99'\t%Y='-0999'\t\n"), fmt, std::chrono::year{-999}); check(SV("%C='-1'\t%y='99'\t%Y='-0099'\t\n"), fmt, std::chrono::year{-99}); From ab70b63a71abc741f12ed5ba2ff438fe44f46e5d Mon Sep 17 00:00:00 2001 From: max Date: Tue, 30 May 2023 10:46:55 -0500 Subject: [PATCH 109/704] [MLIR][CAPI] Move `DenseMapInfo` I mistakenly put this in `mlir/CAPI/Support.h` at some point during the flurry of refactoring of `TypeCaster`s but as @jpienaar rightly pointed out, it doesn't belong there. 
Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D151669 --- mlir/include/mlir/CAPI/Support.h | 21 --------------------- mlir/lib/Bindings/Python/PybindUtils.h | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/mlir/include/mlir/CAPI/Support.h b/mlir/include/mlir/CAPI/Support.h index e42413dbe6d28..f3e8a67e0ac36 100644 --- a/mlir/include/mlir/CAPI/Support.h +++ b/mlir/include/mlir/CAPI/Support.h @@ -44,25 +44,4 @@ inline mlir::LogicalResult unwrap(MlirLogicalResult res) { DEFINE_C_API_METHODS(MlirTypeID, mlir::TypeID) DEFINE_C_API_PTR_METHODS(MlirTypeIDAllocator, mlir::TypeIDAllocator) -namespace llvm { - -template <> -struct DenseMapInfo { - static inline MlirTypeID getEmptyKey() { - auto *pointer = llvm::DenseMapInfo::getEmptyKey(); - return mlirTypeIDCreate(pointer); - } - static inline MlirTypeID getTombstoneKey() { - auto *pointer = llvm::DenseMapInfo::getTombstoneKey(); - return mlirTypeIDCreate(pointer); - } - static inline unsigned getHashValue(const MlirTypeID &val) { - return mlirTypeIDHashValue(val); - } - static inline bool isEqual(const MlirTypeID &lhs, const MlirTypeID &rhs) { - return mlirTypeIDEqual(lhs, rhs); - } -}; -} // namespace llvm - #endif // MLIR_CAPI_SUPPORT_H diff --git a/mlir/lib/Bindings/Python/PybindUtils.h b/mlir/lib/Bindings/Python/PybindUtils.h index 41de7e9b46695..2a8da20bee049 100644 --- a/mlir/lib/Bindings/Python/PybindUtils.h +++ b/mlir/lib/Bindings/Python/PybindUtils.h @@ -354,4 +354,25 @@ class Sliceable { } // namespace mlir +namespace llvm { + +template <> +struct DenseMapInfo { + static inline MlirTypeID getEmptyKey() { + auto *pointer = llvm::DenseMapInfo::getEmptyKey(); + return mlirTypeIDCreate(pointer); + } + static inline MlirTypeID getTombstoneKey() { + auto *pointer = llvm::DenseMapInfo::getTombstoneKey(); + return mlirTypeIDCreate(pointer); + } + static inline unsigned getHashValue(const MlirTypeID &val) { + return mlirTypeIDHashValue(val); + } + static inline bool isEqual(const MlirTypeID &lhs, const MlirTypeID &rhs) { + return mlirTypeIDEqual(lhs, rhs); + } +}; +} // namespace llvm + #endif // MLIR_BINDINGS_PYTHON_PYBINDUTILS_H From 86821b54eb29740cf67c1d6cd3589f31cb66fd46 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 30 May 2023 10:07:14 -0700 Subject: [PATCH 110/704] [RISCV] Add copyright header to IntrinsicsRISCVXTHead.td and IntrinsicsRISCVXsf.td. NFC --- llvm/include/llvm/IR/IntrinsicsRISCVXTHead.td | 12 ++++++++++++ llvm/include/llvm/IR/IntrinsicsRISCVXsf.td | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/llvm/include/llvm/IR/IntrinsicsRISCVXTHead.td b/llvm/include/llvm/IR/IntrinsicsRISCVXTHead.td index 8486b678022b4..5af10a3e197aa 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCVXTHead.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCVXTHead.td @@ -1,3 +1,15 @@ +//===- IntrinsicsRISCVXTHead.td - T-Head intrinsics --------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the T-Head vendor intrinsics for RISC-V. 
+// +//===----------------------------------------------------------------------===// + let TargetPrefix = "riscv" in { class TH_VdotTernaryWideMasked diff --git a/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td b/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td index 67105944616c3..0c8da35491cef 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCVXsf.td @@ -1,3 +1,15 @@ +//===- IntrinsicsRISCVXsf.td - SiFive intrinsics -----------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the SiFive vendor intrinsics for RISC-V. +// +//===----------------------------------------------------------------------===// + class VCIXSuffix { list suffix = !cond(!eq(range, "c"): ["e8mf8", "e8mf4", "e8mf2", "e8m1", "e8m2", "e8m4", "e8m8"], !eq(range, "s"): ["e16mf4", "e16mf2", "e16m1", "e16m2", "e16m4", "e16m8"], From 5c000df21531fc4b614cf80d29c5f05a664aaa41 Mon Sep 17 00:00:00 2001 From: Peter Steinfeld Date: Tue, 30 May 2023 07:48:51 -0700 Subject: [PATCH 111/704] [flang] [NFC] Remove an unneeded include. The title says it all. Differential Revision: https://reviews.llvm.org/D151712 --- flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index 72fb630dc2526..63d66adf222f6 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -10,7 +10,6 @@ #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" -#include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Parser/parse-tree.h" #include "flang/Runtime/misc-intrinsic.h" From 85670ac86813b170c9301aa477421c56a71a7e1e Mon Sep 17 00:00:00 2001 From: Ben Hamilton Date: Fri, 26 May 2023 12:44:21 -0600 Subject: [PATCH 112/704] [Format/ObjC] Support NS_ASSUME_NONNULL_BEGIN and FOUNDATION_EXPORT in ObjC language guesser This adds to the ObjC language guesser a few more common macros used in ObjC headers. These can help distinguish ObjC headers which otherwise lack ObjC types from C++ headers. Contributed by danblakemore. Tested: New tests included. Ran unit tests with: ``` % cmake -S llvm -B build -G Ninja && \ ninja -C build FormatTests && \ ./build/tools/clang/unittests/Format/FormatTests --gtest_filter="*FormatTestObjC*" (snip) [----------] 24 tests from FormatTestObjC (265 ms total) [----------] Global test environment tear-down [==========] 26 tests from 2 test suites ran. (270 ms total) [ PASSED ] 26 tests. 
``` Reviewed By: MyDeveloperDay Differential Revision: https://reviews.llvm.org/D151578 --- clang/lib/Format/Format.cpp | 3 +++ clang/test/Format/dump-config-objc-macros.h | 8 ++++++++ clang/unittests/Format/FormatTestObjC.cpp | 20 ++++++++++++++++++++ 3 files changed, 31 insertions(+) create mode 100644 clang/test/Format/dump-config-objc-macros.h diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 154e6a21981e8..d7128ed558dc5 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -2687,6 +2687,8 @@ class ObjCHeaderStyleGuesser : public TokenAnalyzer { "CGSizeMake", "CGVector", "CGVectorMake", + "FOUNDATION_EXPORT", // This is an alias for FOUNDATION_EXTERN. + "FOUNDATION_EXTERN", "NSAffineTransform", "NSArray", "NSAttributedString", @@ -2743,6 +2745,7 @@ class ObjCHeaderStyleGuesser : public TokenAnalyzer { "NSURLQueryItem", "NSUUID", "NSValue", + "NS_ASSUME_NONNULL_BEGIN", "UIImage", "UIView", }; diff --git a/clang/test/Format/dump-config-objc-macros.h b/clang/test/Format/dump-config-objc-macros.h new file mode 100644 index 0000000000000..c90aa6067b1f4 --- /dev/null +++ b/clang/test/Format/dump-config-objc-macros.h @@ -0,0 +1,8 @@ +// RUN: clang-format -dump-config %s | FileCheck %s + +// CHECK: Language: ObjC +NS_ASSUME_NONNULL_BEGIN + +FOUNDATION_EXTERN int kConstant; + +NS_ASSUME_NONNULL_END diff --git a/clang/unittests/Format/FormatTestObjC.cpp b/clang/unittests/Format/FormatTestObjC.cpp index 55969ff6fd4a9..0cae6e2950fe6 100644 --- a/clang/unittests/Format/FormatTestObjC.cpp +++ b/clang/unittests/Format/FormatTestObjC.cpp @@ -94,6 +94,26 @@ TEST(FormatTestObjCStyle, DetectsObjCInHeaders) { ASSERT_TRUE((bool)Style); EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language); + Style = getStyle("{}", "a.h", "none", R"objc( +NS_ASSUME_NONNULL_BEGIN +extern int i; +NS_ASSUME_NONNULL_END +)objc"); + ASSERT_TRUE((bool)Style); + EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language); + + Style = getStyle("{}", "a.h", "none", R"objc( +FOUNDATION_EXTERN void DoStuff(void); +)objc"); + ASSERT_TRUE((bool)Style); + EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language); + + Style = getStyle("{}", "a.h", "none", R"objc( +FOUNDATION_EXPORT void DoStuff(void); +)objc"); + ASSERT_TRUE((bool)Style); + EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language); + Style = getStyle("{}", "a.h", "none", "enum Foo {};"); ASSERT_TRUE((bool)Style); EXPECT_EQ(FormatStyle::LK_Cpp, Style->Language); From d8291908ef49e0d560276c19c552a67d9fb3ef1d Mon Sep 17 00:00:00 2001 From: Paul Robinson Date: Tue, 30 May 2023 10:13:46 -0700 Subject: [PATCH 113/704] [Headers][doc] Add add/sub/mul intrinsic descriptions to avx2intrin.h Differential Revision: https://reviews.llvm.org/D150114 --- clang/lib/Headers/avx2intrin.h | 599 +++++++++++++++++++++++++++++++++ 1 file changed, 599 insertions(+) diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 1fbc4edafbd7f..be4d30a98195e 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -65,48 +65,150 @@ _mm256_packus_epi32(__m256i __V1, __m256i __V2) return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); } +/// Adds 8-bit integers from corresponding bytes of two 256-bit integer +/// vectors and returns the lower 8 bits of each sum in the corresponding +/// byte of the 256-bit integer vector result (overflow is ignored). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPADDB instruction. 
+/// +/// \param __a +/// A 256-bit integer vector containing one of the source operands. +/// \param __b +/// A 256-bit integer vector containing one of the source operands. +/// \returns A 256-bit integer vector containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8(__m256i __a, __m256i __b) { return (__m256i)((__v32qu)__a + (__v32qu)__b); } +/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of +/// [16 x i16] and returns the lower 16 bits of each sum in the +/// corresponding element of the [16 x i16] result (overflow is ignored). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPADDW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hu)__a + (__v16hu)__b); } +/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of +/// [8 x i32] and returns the lower 32 bits of each sum in the corresponding +/// element of the [8 x i32] result (overflow is ignored). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPADDD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \returns A 256-bit vector of [8 x i32] containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32(__m256i __a, __m256i __b) { return (__m256i)((__v8su)__a + (__v8su)__b); } +/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of +/// [4 x i64] and returns the lower 64 bits of each sum in the corresponding +/// element of the [4 x i64] result (overflow is ignored). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPADDQ instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x i64] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [4 x i64] containing one of the source operands. +/// \returns A 256-bit vector of [4 x i64] containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi64(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a + (__v4du)__b); } +/// Adds 8-bit integers from corresponding bytes of two 256-bit integer +/// vectors using signed saturation, and returns each sum in the +/// corresponding byte of the 256-bit integer vector result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPADDSB instruction. +/// +/// \param __a +/// A 256-bit integer vector containing one of the source operands. +/// \param __b +/// A 256-bit integer vector containing one of the source operands. +/// \returns A 256-bit integer vector containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b); } +/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of +/// [16 x i16] using signed saturation, and returns the [16 x i16] result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPADDSW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. 
+/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b); } +/// Adds 8-bit integers from corresponding bytes of two 256-bit integer +/// vectors using unsigned saturation, and returns each sum in the +/// corresponding byte of the 256-bit integer vector result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPADDUSB instruction. +/// +/// \param __a +/// A 256-bit integer vector containing one of the source operands. +/// \param __b +/// A 256-bit integer vector containing one of the source operands. +/// \returns A 256-bit integer vector containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b); } +/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of +/// [16 x i16] using unsigned saturation, and returns the [16 x i16] result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPADDUSW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu16(__m256i __a, __m256i __b) { @@ -202,48 +304,269 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b) return (__m256i)((__v4di)__a > (__v4di)__b); } +/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit +/// vectors of [16 x i16] and returns the lower 16 bits of each sum in an +/// element of the [16 x i16] result (overflow is ignored). Sums from +/// \a __a are returned in the lower 64 bits of each 128-bit half of the +/// result; sums from \a __b are returned in the upper 64 bits of each +/// 128-bit half of the result. +/// +/// \code{.operation} +/// FOR i := 0 TO 1 +/// j := i*128 +/// result[j+15:j] := __a[j+15:j] + __a[j+31:j+16] +/// result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48] +/// result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80] +/// result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112] +/// result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16] +/// result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48] +/// result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80] +/// result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112] +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHADDW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); } +/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit +/// vectors of [8 x i32] and returns the lower 32 bits of each sum in an +/// element of the [8 x i32] result (overflow is ignored). 
Sums from \a __a +/// are returned in the lower 64 bits of each 128-bit half of the result; +/// sums from \a __b are returned in the upper 64 bits of each 128-bit half +/// of the result. +/// +/// \code{.operation} +/// FOR i := 0 TO 1 +/// j := i*128 +/// result[j+31:j] := __a[j+31:j] + __a[j+63:j+32] +/// result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96] +/// result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32] +/// result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96] +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHADDD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \returns A 256-bit vector of [8 x i32] containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); } +/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit +/// vectors of [16 x i16] using signed saturation and returns each sum in +/// an element of the [16 x i16] result. Sums from \a __a are returned in +/// the lower 64 bits of each 128-bit half of the result; sums from \a __b +/// are returned in the upper 64 bits of each 128-bit half of the result. +/// +/// \code{.operation} +/// FOR i := 0 TO 1 +/// j := i*128 +/// result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16]) +/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48]) +/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80]) +/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112]) +/// result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16]) +/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48]) +/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80]) +/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112]) +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHADDSW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the sums. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadds_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); } +/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit +/// vectors of [16 x i16] and returns the lower 16 bits of each difference +/// in an element of the [16 x i16] result (overflow is ignored). +/// Differences from \a __a are returned in the lower 64 bits of each +/// 128-bit half of the result; differences from \a __b are returned in the +/// upper 64 bits of each 128-bit half of the result. 
+/// +/// \code{.operation} +/// FOR i := 0 TO 1 +/// j := i*128 +/// result[j+15:j] := __a[j+15:j] - __a[j+31:j+16] +/// result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48] +/// result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80] +/// result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112] +/// result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16] +/// result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48] +/// result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80] +/// result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112] +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHSUBW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); } +/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit +/// vectors of [8 x i32] and returns the lower 32 bits of each difference in +/// an element of the [8 x i32] result (overflow is ignored). Differences +/// from \a __a are returned in the lower 64 bits of each 128-bit half of +/// the result; differences from \a __b are returned in the upper 64 bits +/// of each 128-bit half of the result. +/// +/// \code{.operation} +/// FOR i := 0 TO 1 +/// j := i*128 +/// result[j+31:j] := __a[j+31:j] - __a[j+63:j+32] +/// result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96] +/// result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32] +/// result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96] +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHSUBD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \returns A 256-bit vector of [8 x i32] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); } +/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit +/// vectors of [16 x i16] using signed saturation and returns each sum in +/// an element of the [16 x i16] result. Differences from \a __a are +/// returned in the lower 64 bits of each 128-bit half of the result; +/// differences from \a __b are returned in the upper 64 bits of each +/// 128-bit half of the result. +/// +/// \code{.operation} +/// FOR i := 0 TO 1 +/// j := i*128 +/// result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16]) +/// result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48]) +/// result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80]) +/// result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112]) +/// result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16]) +/// result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48]) +/// result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80]) +/// result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112]) +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHSUBSW instruction. 
+/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsubs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); } +/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a +/// with the corresponding signed byte from the 256-bit integer vector in +/// \a __b, forming signed 16-bit intermediate products. Adds adjacent +/// pairs of those products using signed saturation to form 16-bit sums +/// returned as elements of the [16 x i16] result. +/// +/// \code{.operation} +/// FOR i := 0 TO 15 +/// j := i*16 +/// temp1 := __a[j+7:j] * __b[j+7:j] +/// temp2 := __a[j+15:j+8] * __b[j+15:j+8] +/// result[j+15:j] := SATURATE16(temp1 + temp2) +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMADDUBSW instruction. +/// +/// \param __a +/// A 256-bit vector containing one of the source operands. +/// \param __b +/// A 256-bit vector containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maddubs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); } +/// Multiplies corresponding 16-bit elements of two 256-bit vectors of +/// [16 x i16], forming 32-bit intermediate products, and adds pairs of +/// those products to form 32-bit sums returned as elements of the +/// [8 x i32] result. +/// +/// There is only one wraparound case: when all four of the 16-bit sources +/// are \c 0x8000, the result will be \c 0x80000000. +/// +/// \code{.operation} +/// FOR i := 0 TO 7 +/// j := i*32 +/// temp1 := __a[j+15:j] * __b[j+15:j] +/// temp2 := __a[j+31:j+16] * __b[j+31:j+16] +/// result[j+31:j] := temp1 + temp2 +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMADDWD instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [8 x i32] containing the result. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16(__m256i __a, __m256i __b) { @@ -406,42 +729,155 @@ _mm256_cvtepu32_epi64(__m128i __V) return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); } +/// Multiplies signed 32-bit integers from even-numbered elements of two +/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the +/// [4 x i64] result. +/// +/// \code{.operation} +/// result[63:0] := __a[31:0] * __b[31:0] +/// result[127:64] := __a[95:64] * __b[95:64] +/// result[191:128] := __a[159:128] * __b[159:128] +/// result[255:192] := __a[223:192] * __b[223:192] +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMULDQ instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \returns A 256-bit vector of [4 x i64] containing the products. 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); } +/// Multiplies signed 16-bit integer elements of two 256-bit vectors of +/// [16 x i16], truncates the 32-bit results to the most significant 18 +/// bits, rounds by adding 1, and returns bits [16:1] of each rounded +/// product in the [16 x i16] result. +/// +/// \code{.operation} +/// FOR i := 0 TO 15 +/// j := i*16 +/// temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1 +/// result[j+15:j] := temp[16:1] +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMULHRSW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the rounded products. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhrs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); } +/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of +/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the +/// [16 x i16] result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMULHUW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the products. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); } +/// Multiplies signed 16-bit integer elements of two 256-bit vectors of +/// [16 x i16], and returns the upper 16 bits of each 32-bit product in the +/// [16 x i16] result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMULHW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the products. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); } +/// Multiplies signed 16-bit integer elements of two 256-bit vectors of +/// [16 x i16], and returns the lower 16 bits of each 32-bit product in the +/// [16 x i16] result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMULLW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [16 x i16] containing one of the source operands. +/// \returns A 256-bit vector of [16 x i16] containing the products. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hu)__a * (__v16hu)__b); } +/// Multiplies signed 32-bit integer elements of two 256-bit vectors of +/// [8 x i32], and returns the lower 32 bits of each 64-bit product in the +/// [8 x i32] result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMULLD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32] containing one of the source operands. 
+/// \param __b +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \returns A 256-bit vector of [8 x i32] containing the products. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi32 (__m256i __a, __m256i __b) { return (__m256i)((__v8su)__a * (__v8su)__b); } +/// Multiplies unsigned 32-bit integers from even-numered elements of two +/// 256-bit vectors of [8 x i32] and returns the 64-bit products in the +/// [4 x i64] result. +/// +/// \code{.operation} +/// result[63:0] := __a[31:0] * __b[31:0] +/// result[127:64] := __a[95:64] * __b[95:64] +/// result[191:128] := __a[159:128] * __b[159:128] +/// result[255:192] := __a[223:192] * __b[223:192] +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMULUDQ instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [8 x i32] containing one of the source operands. +/// \returns A 256-bit vector of [4 x i64] containing the products. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epu32(__m256i __a, __m256i __b) { @@ -897,48 +1333,211 @@ _mm256_srl_epi64(__m256i __a, __m128i __count) return __builtin_ia32_psrlq256((__v4di)__a, __count); } +/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer +/// vectors. Returns the lower 8 bits of each difference in the +/// corresponding byte of the 256-bit integer vector result (overflow is +/// ignored). +/// +/// \code{.operation} +/// FOR i := 0 TO 31 +/// j := i*8 +/// result[j+7:j] := __a[j+7:j] - __b[j+7:j] +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSUBB instruction. +/// +/// \param __a +/// A 256-bit integer vector containing the minuends. +/// \param __b +/// A 256-bit integer vector containing the subtrahends. +/// \returns A 256-bit integer vector containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8(__m256i __a, __m256i __b) { return (__m256i)((__v32qu)__a - (__v32qu)__b); } +/// Subtracts 16-bit integers from corresponding elements of two 256-bit +/// vectors of [16 x i16]. Returns the lower 16 bits of each difference in +/// the corresponding element of the [16 x i16] result (overflow is +/// ignored). +/// +/// \code{.operation} +/// FOR i := 0 TO 15 +/// j := i*16 +/// result[j+15:j] := __a[j+15:j] - __b[j+15:j] +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSUBW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing the minuends. +/// \param __b +/// A 256-bit vector of [16 x i16] containing the subtrahends. +/// \returns A 256-bit vector of [16 x i16] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hu)__a - (__v16hu)__b); } +/// Subtracts 32-bit integers from corresponding elements of two 256-bit +/// vectors of [8 x i32]. Returns the lower 32 bits of each difference in +/// the corresponding element of the [8 x i32] result (overflow is ignored). +/// +/// \code{.operation} +/// FOR i := 0 TO 7 +/// j := i*32 +/// result[j+31:j] := __a[j+31:j] - __b[j+31:j] +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSUBD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32] containing the minuends. +/// \param __b +/// A 256-bit vector of [8 x i32] containing the subtrahends. 
+/// \returns A 256-bit vector of [8 x i32] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32(__m256i __a, __m256i __b) { return (__m256i)((__v8su)__a - (__v8su)__b); } +/// Subtracts 64-bit integers from corresponding elements of two 256-bit +/// vectors of [4 x i64]. Returns the lower 64 bits of each difference in +/// the corresponding element of the [4 x i64] result (overflow is ignored). +/// +/// \code{.operation} +/// FOR i := 0 TO 3 +/// j := i*64 +/// result[j+63:j] := __a[j+63:j] - __b[j+63:j] +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSUBQ instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x i64] containing the minuends. +/// \param __b +/// A 256-bit vector of [4 x i64] containing the subtrahends. +/// \returns A 256-bit vector of [4 x i64] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi64(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a - (__v4du)__b); } +/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer +/// vectors using signed saturation, and returns each differences in the +/// corresponding byte of the 256-bit integer vector result. +/// +/// \code{.operation} +/// FOR i := 0 TO 31 +/// j := i*8 +/// result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j]) +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSUBSB instruction. +/// +/// \param __a +/// A 256-bit integer vector containing the minuends. +/// \param __b +/// A 256-bit integer vector containing the subtrahends. +/// \returns A 256-bit integer vector containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); } +/// Subtracts 16-bit integers from corresponding elements of two 256-bit +/// vectors of [16 x i16] using signed saturation, and returns each +/// difference in the corresponding element of the [16 x i16] result. +/// +/// \code{.operation} +/// FOR i := 0 TO 15 +/// j := i*16 +/// result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j]) +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSUBSW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing the minuends. +/// \param __b +/// A 256-bit vector of [16 x i16] containing the subtrahends. +/// \returns A 256-bit vector of [16 x i16] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); } +/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer +/// vectors using unsigned saturation, and returns each difference in the +/// corresponding byte of the 256-bit integer vector result. For each byte, +/// computes result = __a - __b . +/// +/// \code{.operation} +/// FOR i := 0 TO 31 +/// j := i*8 +/// result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j]) +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSUBUSB instruction. +/// +/// \param __a +/// A 256-bit integer vector containing the minuends. +/// \param __b +/// A 256-bit integer vector containing the subtrahends. +/// \returns A 256-bit integer vector containing the differences. 
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); } +/// Subtracts 16-bit integers from corresponding elements of two 256-bit +/// vectors of [16 x i16] using unsigned saturation, and returns each +/// difference in the corresponding element of the [16 x i16] result. +/// +/// \code{.operation} +/// FOR i := 0 TO 15 +/// j := i*16 +/// result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j]) +/// ENDFOR +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSUBUSW instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x i16] containing the minuends. +/// \param __b +/// A 256-bit vector of [16 x i16] containing the subtrahends. +/// \returns A 256-bit vector of [16 x i16] containing the differences. static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu16(__m256i __a, __m256i __b) { From 24172de17d38787c891be69ccaef408b65a1e5e4 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 30 May 2023 10:21:15 -0700 Subject: [PATCH 114/704] [RISCV] Add tests for vslide1down shuffle/insert idiom --- .../rvv/fixed-vector-shuffle-vslide1down.ll | 320 ++++++++++++++++++ 1 file changed, 320 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll new file mode 100644 index 0000000000000..67b729aad971b --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll @@ -0,0 +1,320 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+experimental-zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +define <2 x i8> @vslide1up_2xi8(<2 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1up_2xi8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret + %vb = insertelement <2 x i8> poison, i8 %b, i64 0 + %v1 = shufflevector <2 x i8> %v, <2 x i8> %vb, <2 x i32> + ret <2 x i8> %v1 +} + +define <4 x i8> @vslide1up_4xi8(<4 x i8> %v, i8 %b) { +; RV32-LABEL: vslide1up_4xi8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vslideup.vi v8, v9, 3 +; RV32-NEXT: ret +; +; RV64-LABEL: vslide1up_4xi8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vslideup.vi v8, v9, 3 +; RV64-NEXT: ret + %vb = insertelement <4 x i8> poison, i8 %b, i64 0 + %v1 = shufflevector <4 x i8> %v, <4 x i8> %vb, <4 x i32> + ret <4 x i8> %v1 +} + +define <4 x i8> @vslide1up_4xi8_swapped(<4 x i8> %v, i8 %b) { +; RV32-LABEL: vslide1up_4xi8_swapped: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vslideup.vi v8, v9, 3 +; RV32-NEXT: ret 
+; +; RV64-LABEL: vslide1up_4xi8_swapped: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vslideup.vi v8, v9, 3 +; RV64-NEXT: ret + %vb = insertelement <4 x i8> poison, i8 %b, i64 0 + %v1 = shufflevector <4 x i8> %vb, <4 x i8> %v, <4 x i32> + ret <4 x i8> %v1 +} + +define <2 x i16> @vslide1up_2xi16(<2 x i16> %v, i16 %b) { +; RV32-LABEL: vslide1up_2xi16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v9, a0 +; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vslideup.vi v8, v9, 1 +; RV32-NEXT: ret +; +; RV64-LABEL: vslide1up_2xi16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vslideup.vi v8, v9, 1 +; RV64-NEXT: ret + %vb = insertelement <2 x i16> poison, i16 %b, i64 0 + %v1 = shufflevector <2 x i16> %v, <2 x i16> %vb, <2 x i32> + ret <2 x i16> %v1 +} + +define <4 x i16> @vslide1up_4xi16(<4 x i16> %v, i16 %b) { +; RV32-LABEL: vslide1up_4xi16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vslideup.vi v8, v9, 3 +; RV32-NEXT: ret +; +; RV64-LABEL: vslide1up_4xi16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vslideup.vi v8, v9, 3 +; RV64-NEXT: ret + %vb = insertelement <4 x i16> poison, i16 %b, i64 0 + %v1 = shufflevector <4 x i16> %v, <4 x i16> %vb, <4 x i32> + ret <4 x i16> %v1 +} + +define <2 x i32> @vslide1up_2xi32(<2 x i32> %v, i32 %b) { +; RV32-LABEL: vslide1up_2xi32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vslideup.vi v8, v9, 1 +; RV32-NEXT: ret +; +; RV64-LABEL: vslide1up_2xi32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.s.x v9, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vslideup.vi v8, v9, 1 +; RV64-NEXT: ret + %vb = insertelement <2 x i32> poison, i32 %b, i64 0 + %v1 = shufflevector <2 x i32> %v, <2 x i32> %vb, <2 x i32> + ret <2 x i32> %v1 +} + +define <4 x i32> @vslide1up_4xi32(<4 x i32> %v, i32 %b) { +; CHECK-LABEL: vslide1up_4xi32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: ret + %vb = insertelement <4 x i32> poison, i32 %b, i64 0 + %v1 = shufflevector <4 x i32> %v, <4 x i32> %vb, <4 x i32> + ret <4 x i32> %v1 +} + +define <2 x i64> @vslide1up_2xi64(<2 x i64> %v, i64 %b) { +; RV32-LABEL: vslide1up_2xi64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vslideup.vi v8, v9, 1 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vslide1up_2xi64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vslideup.vi v8, v9, 1 +; RV64-NEXT: ret + 
%vb = insertelement <2 x i64> poison, i64 %b, i64 0 + %v1 = shufflevector <2 x i64> %v, <2 x i64> %vb, <2 x i32> + ret <2 x i64> %v1 +} + +define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) { +; RV32-LABEL: vslide1up_4xi64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vslideup.vi v8, v10, 3 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vslide1up_4xi64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vslideup.vi v8, v10, 3 +; RV64-NEXT: ret + %vb = insertelement <4 x i64> poison, i64 %b, i64 0 + %v1 = shufflevector <4 x i64> %v, <4 x i64> %vb, <4 x i32> + ret <4 x i64> %v1 +} + +define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) { +; CHECK-LABEL: vslide1up_2xf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret + %vb = insertelement <2 x half> poison, half %b, i64 0 + %v1 = shufflevector <2 x half> %v, <2 x half> %vb, <2 x i32> + ret <2 x half> %v1 +} + +define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) { +; CHECK-LABEL: vslide1up_4xf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: ret + %vb = insertelement <4 x half> poison, half %b, i64 0 + %v1 = shufflevector <4 x half> %v, <4 x half> %vb, <4 x i32> + ret <4 x half> %v1 +} + +define <2 x float> @vslide1up_2xf32(<2 x float> %v, float %b) { +; CHECK-LABEL: vslide1up_2xf32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret + %vb = insertelement <2 x float> poison, float %b, i64 0 + %v1 = shufflevector <2 x float> %v, <2 x float> %vb, <2 x i32> + ret <2 x float> %v1 +} + +define <4 x float> @vslide1up_4xf32(<4 x float> %v, float %b) { +; CHECK-LABEL: vslide1up_4xf32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: ret + %vb = insertelement <4 x float> poison, float %b, i64 0 + %v1 = shufflevector <4 x float> %v, <4 x float> %vb, <4 x i32> + ret <4 x float> %v1 +} + +define <2 x double> @vslide1up_2xf64(<2 x double> %v, double %b) { +; CHECK-LABEL: vslide1up_2xf64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret + %vb = insertelement <2 x double> poison, double %b, i64 0 + %v1 = shufflevector <2 x double> %v, <2 x double> %vb, <2 x i32> + ret <2 x double> %v1 +} + +define <4 x double> @vslide1up_4xf64(<4 x double> %v, double %b) { +; CHECK-LABEL: vslide1up_4xf64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vslideup.vi v8, v10, 3 +; CHECK-NEXT: ret + %vb = insertelement <4 x double> poison, double %b, i64 0 + %v1 = 
shufflevector <4 x double> %v, <4 x double> %vb, <4 x i32> + ret <4 x double> %v1 +} + +define <4 x i8> @vslide1up_4xi8_with_splat(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1up_4xi8_with_splat: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 7 +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vadd.vi v10, v9, 1 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vrgather.vv v9, v8, v10, v0.t +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %vb = insertelement <4 x i8> poison, i8 %b, i64 0 + %v1 = shufflevector <4 x i8> %vb, <4 x i8> poison, <4 x i32> zeroinitializer + %v2 = shufflevector <4 x i8> %v1, <4 x i8> %v, <4 x i32> + ret <4 x i8> %v2 +} + +define <2 x double> @vslide1up_v2f64_inverted(<2 x double> %v, double %b) { +; CHECK-LABEL: vslide1up_v2f64_inverted: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrgather.vi v9, v8, 0 +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %v1 = shufflevector <2 x double> %v, <2 x double> poison, <2 x i32> + %v2 = insertelement <2 x double> %v1, double %b, i64 1 + ret <2 x double> %v2 +} + +define <4 x i8> @vslide1up_4xi8_inverted(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1up_4xi8_inverted: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv.s.x v8, a0 +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %v1 = shufflevector <4 x i8> %v, <4 x i8> poison, <4 x i32> + %v2 = insertelement <4 x i8> %v1, i8 %b, i64 1 + ret <4 x i8> %v2 +} From 0bb23c58be03decb76f5c505bfc69b4ceaa09169 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 30 May 2023 10:32:24 -0700 Subject: [PATCH 115/704] [RISCV] Rename vslide1down tests (should have been part of 24172de) --- .../rvv/fixed-vector-shuffle-vslide1down.ll | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll index 67b729aad971b..febb7d0afd7ba 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1down.ll @@ -4,8 +4,8 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -define <2 x i8> @vslide1up_2xi8(<2 x i8> %v, i8 %b) { -; CHECK-LABEL: vslide1up_2xi8: +define <2 x i8> @vslide1down_2xi8(<2 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1down_2xi8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 @@ -17,8 +17,8 @@ define <2 x i8> @vslide1up_2xi8(<2 x i8> %v, i8 %b) { ret <2 x i8> %v1 } -define <4 x i8> @vslide1up_4xi8(<4 x i8> %v, i8 %b) { -; RV32-LABEL: vslide1up_4xi8: +define <4 x i8> @vslide1down_4xi8(<4 x i8> %v, i8 %b) { +; RV32-LABEL: vslide1down_4xi8: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-NEXT: vmv.s.x v9, a0 @@ -27,7 +27,7 @@ define <4 x i8> @vslide1up_4xi8(<4 x i8> %v, i8 %b) { ; RV32-NEXT: vslideup.vi v8, v9, 3 ; RV32-NEXT: ret ; -; RV64-LABEL: vslide1up_4xi8: +; RV64-LABEL: vslide1down_4xi8: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV64-NEXT: vmv.v.x v9, a0 @@ -39,8 +39,8 @@ define <4 x i8> @vslide1up_4xi8(<4 x i8> %v, i8 %b) { ret <4 x i8> %v1 } -define <4 x i8> @vslide1up_4xi8_swapped(<4 x i8> %v, i8 %b) { -; 
RV32-LABEL: vslide1up_4xi8_swapped: +define <4 x i8> @vslide1down_4xi8_swapped(<4 x i8> %v, i8 %b) { +; RV32-LABEL: vslide1down_4xi8_swapped: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-NEXT: vmv.s.x v9, a0 @@ -49,7 +49,7 @@ define <4 x i8> @vslide1up_4xi8_swapped(<4 x i8> %v, i8 %b) { ; RV32-NEXT: vslideup.vi v8, v9, 3 ; RV32-NEXT: ret ; -; RV64-LABEL: vslide1up_4xi8_swapped: +; RV64-LABEL: vslide1down_4xi8_swapped: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV64-NEXT: vmv.v.x v9, a0 @@ -61,8 +61,8 @@ define <4 x i8> @vslide1up_4xi8_swapped(<4 x i8> %v, i8 %b) { ret <4 x i8> %v1 } -define <2 x i16> @vslide1up_2xi16(<2 x i16> %v, i16 %b) { -; RV32-LABEL: vslide1up_2xi16: +define <2 x i16> @vslide1down_2xi16(<2 x i16> %v, i16 %b) { +; RV32-LABEL: vslide1down_2xi16: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-NEXT: vmv.s.x v9, a0 @@ -71,7 +71,7 @@ define <2 x i16> @vslide1up_2xi16(<2 x i16> %v, i16 %b) { ; RV32-NEXT: vslideup.vi v8, v9, 1 ; RV32-NEXT: ret ; -; RV64-LABEL: vslide1up_2xi16: +; RV64-LABEL: vslide1down_2xi16: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; RV64-NEXT: vmv.v.x v9, a0 @@ -83,8 +83,8 @@ define <2 x i16> @vslide1up_2xi16(<2 x i16> %v, i16 %b) { ret <2 x i16> %v1 } -define <4 x i16> @vslide1up_4xi16(<4 x i16> %v, i16 %b) { -; RV32-LABEL: vslide1up_4xi16: +define <4 x i16> @vslide1down_4xi16(<4 x i16> %v, i16 %b) { +; RV32-LABEL: vslide1down_4xi16: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 @@ -92,7 +92,7 @@ define <4 x i16> @vslide1up_4xi16(<4 x i16> %v, i16 %b) { ; RV32-NEXT: vslideup.vi v8, v9, 3 ; RV32-NEXT: ret ; -; RV64-LABEL: vslide1up_4xi16: +; RV64-LABEL: vslide1down_4xi16: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vmv.s.x v9, a0 @@ -105,8 +105,8 @@ define <4 x i16> @vslide1up_4xi16(<4 x i16> %v, i16 %b) { ret <4 x i16> %v1 } -define <2 x i32> @vslide1up_2xi32(<2 x i32> %v, i32 %b) { -; RV32-LABEL: vslide1up_2xi32: +define <2 x i32> @vslide1down_2xi32(<2 x i32> %v, i32 %b) { +; RV32-LABEL: vslide1down_2xi32: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 @@ -114,7 +114,7 @@ define <2 x i32> @vslide1up_2xi32(<2 x i32> %v, i32 %b) { ; RV32-NEXT: vslideup.vi v8, v9, 1 ; RV32-NEXT: ret ; -; RV64-LABEL: vslide1up_2xi32: +; RV64-LABEL: vslide1down_2xi32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vmv.s.x v9, a0 @@ -127,8 +127,8 @@ define <2 x i32> @vslide1up_2xi32(<2 x i32> %v, i32 %b) { ret <2 x i32> %v1 } -define <4 x i32> @vslide1up_4xi32(<4 x i32> %v, i32 %b) { -; CHECK-LABEL: vslide1up_4xi32: +define <4 x i32> @vslide1down_4xi32(<4 x i32> %v, i32 %b) { +; CHECK-LABEL: vslide1down_4xi32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 @@ -140,8 +140,8 @@ define <4 x i32> @vslide1up_4xi32(<4 x i32> %v, i32 %b) { ret <4 x i32> %v1 } -define <2 x i64> @vslide1up_2xi64(<2 x i64> %v, i64 %b) { -; RV32-LABEL: vslide1up_2xi64: +define <2 x i64> @vslide1down_2xi64(<2 x i64> %v, i64 %b) { +; RV32-LABEL: vslide1down_2xi64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 @@ -155,7 +155,7 @@ define <2 x i64> @vslide1up_2xi64(<2 x i64> %v, i64 %b) { ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; -; RV64-LABEL: vslide1up_2xi64: +; RV64-LABEL: vslide1down_2xi64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: 
vmv.v.x v9, a0 @@ -167,8 +167,8 @@ define <2 x i64> @vslide1up_2xi64(<2 x i64> %v, i64 %b) { ret <2 x i64> %v1 } -define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) { -; RV32-LABEL: vslide1up_4xi64: +define <4 x i64> @vslide1down_4xi64(<4 x i64> %v, i64 %b) { +; RV32-LABEL: vslide1down_4xi64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 @@ -182,7 +182,7 @@ define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) { ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; -; RV64-LABEL: vslide1up_4xi64: +; RV64-LABEL: vslide1down_4xi64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vmv.v.x v10, a0 @@ -194,8 +194,8 @@ define <4 x i64> @vslide1up_4xi64(<4 x i64> %v, i64 %b) { ret <4 x i64> %v1 } -define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) { -; CHECK-LABEL: vslide1up_2xf16: +define <2 x half> @vslide1down_2xf16(<2 x half> %v, half %b) { +; CHECK-LABEL: vslide1down_2xf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 @@ -207,8 +207,8 @@ define <2 x half> @vslide1up_2xf16(<2 x half> %v, half %b) { ret <2 x half> %v1 } -define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) { -; CHECK-LABEL: vslide1up_4xf16: +define <4 x half> @vslide1down_4xf16(<4 x half> %v, half %b) { +; CHECK-LABEL: vslide1down_4xf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 @@ -220,8 +220,8 @@ define <4 x half> @vslide1up_4xf16(<4 x half> %v, half %b) { ret <4 x half> %v1 } -define <2 x float> @vslide1up_2xf32(<2 x float> %v, float %b) { -; CHECK-LABEL: vslide1up_2xf32: +define <2 x float> @vslide1down_2xf32(<2 x float> %v, float %b) { +; CHECK-LABEL: vslide1down_2xf32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 @@ -233,8 +233,8 @@ define <2 x float> @vslide1up_2xf32(<2 x float> %v, float %b) { ret <2 x float> %v1 } -define <4 x float> @vslide1up_4xf32(<4 x float> %v, float %b) { -; CHECK-LABEL: vslide1up_4xf32: +define <4 x float> @vslide1down_4xf32(<4 x float> %v, float %b) { +; CHECK-LABEL: vslide1down_4xf32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 @@ -246,8 +246,8 @@ define <4 x float> @vslide1up_4xf32(<4 x float> %v, float %b) { ret <4 x float> %v1 } -define <2 x double> @vslide1up_2xf64(<2 x double> %v, double %b) { -; CHECK-LABEL: vslide1up_2xf64: +define <2 x double> @vslide1down_2xf64(<2 x double> %v, double %b) { +; CHECK-LABEL: vslide1down_2xf64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 @@ -259,8 +259,8 @@ define <2 x double> @vslide1up_2xf64(<2 x double> %v, double %b) { ret <2 x double> %v1 } -define <4 x double> @vslide1up_4xf64(<4 x double> %v, double %b) { -; CHECK-LABEL: vslide1up_4xf64: +define <4 x double> @vslide1down_4xf64(<4 x double> %v, double %b) { +; CHECK-LABEL: vslide1down_4xf64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 @@ -272,8 +272,8 @@ define <4 x double> @vslide1up_4xf64(<4 x double> %v, double %b) { ret <4 x double> %v1 } -define <4 x i8> @vslide1up_4xi8_with_splat(<4 x i8> %v, i8 %b) { -; CHECK-LABEL: vslide1up_4xi8_with_splat: +define <4 x i8> @vslide1down_4xi8_with_splat(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1down_4xi8_with_splat: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 7 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, mu @@ -290,8 +290,8 @@ define <4 x i8> 
@vslide1up_4xi8_with_splat(<4 x i8> %v, i8 %b) { ret <4 x i8> %v2 } -define <2 x double> @vslide1up_v2f64_inverted(<2 x double> %v, double %b) { -; CHECK-LABEL: vslide1up_v2f64_inverted: +define <2 x double> @vslide1down_v2f64_inverted(<2 x double> %v, double %b) { +; CHECK-LABEL: vslide1down_v2f64_inverted: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrgather.vi v9, v8, 0 @@ -304,8 +304,8 @@ define <2 x double> @vslide1up_v2f64_inverted(<2 x double> %v, double %b) { ret <2 x double> %v2 } -define <4 x i8> @vslide1up_4xi8_inverted(<4 x i8> %v, i8 %b) { -; CHECK-LABEL: vslide1up_4xi8_inverted: +define <4 x i8> @vslide1down_4xi8_inverted(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1down_4xi8_inverted: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vi v9, v8, 1 From 2922e7cd9334797c24a317d41275f1258ef9ddd3 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 30 May 2023 10:38:16 -0700 Subject: [PATCH 116/704] Re-enable MLIR test-contraction.mlir integration test after LLVM backend crash was fixed --- mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir index 315c99ba915f1..579dc86cad55b 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/test-contraction.mlir @@ -2,8 +2,7 @@ // RUN: mlir-cpu-runner -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_c_runner_utils | \ // RUN: FileCheck %s -// Disable the execution for now because of LLVM backend bug: https://github.com/llvm/llvm-project/issues/62995 -// UNSUPPORTED: target={{.*}} + #dotp_accesses = [ affine_map<(i) -> (i)>, affine_map<(i) -> (i)>, From 6cdc07a701eec08da450be58d6e1b67428a983dd Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Thu, 25 May 2023 14:17:32 -0700 Subject: [PATCH 117/704] [CUDA] correctly install cuda_wrappers/bits/shared_ptr_base.h The file must go under cuda_wrappers/bits/ directly, but was by mistake copied directly into cuda_wrappers/ during installation. 
Differential Revision: https://reviews.llvm.org/D151503 --- clang/lib/Headers/CMakeLists.txt | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index d41443f0b285c..bbd3d0f2d719a 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -267,6 +267,9 @@ set(cuda_wrapper_files cuda_wrappers/cmath cuda_wrappers/complex cuda_wrappers/new +) + +set(cuda_wrapper_bits_files cuda_wrappers/bits/shared_ptr_base.h ) @@ -328,7 +331,8 @@ endfunction(clang_generate_header) # Copy header files from the source directory to the build directory -foreach( f ${files} ${cuda_wrapper_files} ${ppc_wrapper_files} ${openmp_wrapper_files} ${hlsl_files}) +foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files} + ${ppc_wrapper_files} ${openmp_wrapper_files} ${hlsl_files}) copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f}) endforeach( f ) @@ -432,7 +436,7 @@ add_header_target("arm-common-resource-headers" "${arm_common_files};${arm_commo # Architecture/platform specific targets add_header_target("arm-resource-headers" "${arm_only_files};${arm_only_generated_files}") add_header_target("aarch64-resource-headers" "${aarch64_only_files};${aarch64_only_generated_files}") -add_header_target("cuda-resource-headers" "${cuda_files};${cuda_wrapper_files}") +add_header_target("cuda-resource-headers" "${cuda_files};${cuda_wrapper_files};${cuda_wrapper_bits_files}") add_header_target("hexagon-resource-headers" "${hexagon_files}") add_header_target("hip-resource-headers" "${hip_files}") add_header_target("loongarch-resource-headers" "${loongarch_files}") @@ -466,6 +470,11 @@ install( DESTINATION ${header_install_dir}/cuda_wrappers COMPONENT clang-resource-headers) +install( + FILES ${cuda_wrapper_bits_files} + DESTINATION ${header_install_dir}/cuda_wrappers/bits + COMPONENT clang-resource-headers) + install( FILES ${ppc_wrapper_files} DESTINATION ${header_install_dir}/ppc_wrappers @@ -508,6 +517,12 @@ install( EXCLUDE_FROM_ALL COMPONENT cuda-resource-headers) +install( + FILES ${cuda_wrapper_bits_files} + DESTINATION ${header_install_dir}/cuda_wrappers/bits + EXCLUDE_FROM_ALL + COMPONENT cuda-resource-headers) + install( FILES ${cuda_files} DESTINATION ${header_install_dir} From 6219b7c61a942fb8b6d931b4aac021d293cdde4d Mon Sep 17 00:00:00 2001 From: Deniz Evrenci Date: Tue, 30 May 2023 16:48:28 +0000 Subject: [PATCH 118/704] [clang-tidy] Do not emit bugprone-exception-escape warnings from coroutines All exceptions thrown in coroutine bodies are caught and unhandled_exception member of the coroutine promise type is called. In accordance with the existing rules of diagnostics related to exceptions thrown in functions marked noexcept, even if the promise type's constructor, get_return_object, or unhandled_exception throws, diagnostics should not be emitted. Fixes #61905. 
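As a standalone illustration of the behaviour the check now relies on (a minimal sketch, not part of this patch; the Task and promise_type names are placeholders modelled on the test fixtures below, not a real library type):

  #include <coroutine>

  struct Task {
    struct promise_type {
      Task get_return_object() { return {}; }
      std::suspend_never initial_suspend() noexcept { return {}; }
      std::suspend_never final_suspend() noexcept { return {}; }
      void return_void() {}
      // The compiler wraps the coroutine body in an implicit try/catch and
      // routes any escaping exception here instead of rethrowing it.
      void unhandled_exception() {}
    };
  };

  // The throw below is swallowed by the coroutine machinery via
  // unhandled_exception(), so nothing escapes this noexcept function and
  // bugprone-exception-escape should stay quiet about it.
  Task mayThrowInBody(int b) noexcept {
    if (b == 0)
      throw b;
    co_return;
  }

Exceptions thrown by the promise machinery itself (for example a throwing promise constructor) remain subject to the usual noexcept diagnostics, which is what the ShouldDiag cases below exercise.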
Reviewed By: PiotrZSL, ChuanqiXu Differential Revision: https://reviews.llvm.org/D147417 --- .../clang-tidy/utils/ExceptionAnalyzer.cpp | 13 + clang-tools-extra/docs/ReleaseNotes.rst | 4 + .../bugprone/exception-escape-coro.cpp | 711 ++++++++++++++++++ clang/include/clang/AST/StmtCXX.h | 11 + 4 files changed, 739 insertions(+) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-coro.cpp diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp index c862303706ccb..690e771414a75 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp @@ -523,6 +523,19 @@ ExceptionAnalyzer::ExceptionInfo ExceptionAnalyzer::throwsException( ExceptionInfo Excs = throwsException(DefaultInit->getExpr(), Caught, CallStack); Results.merge(Excs); + } else if (const auto *Coro = dyn_cast(St)) { + for (const Stmt *Child : Coro->childrenExclBody()) { + ExceptionInfo Excs = throwsException(Child, Caught, CallStack); + Results.merge(Excs); + } + ExceptionInfo Excs = throwsException(Coro->getBody(), Caught, CallStack); + for (const Type *Throwable : Excs.getExceptionTypes()) { + if (const auto ThrowableRec = Throwable->getAsCXXRecordDecl()) { + ExceptionInfo DestructorExcs = + throwsException(ThrowableRec->getDestructor(), CallStack); + Results.merge(DestructorExcs); + } + } } else { for (const Stmt *Child : St->children()) { ExceptionInfo Excs = throwsException(Child, Caught, CallStack); diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 1eb8c5ba4b71b..b336cd2fc0e24 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -395,6 +395,10 @@ Changes in existing checks `: warn on ``const &&`` constructors. +- Fixed :doc:`bugprone-exception-escape` + for coroutines where previously a warning would be emitted with coroutines + throwing exceptions in their bodies. 
+ Removed checks ^^^^^^^^^^^^^^ diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-coro.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-coro.cpp new file mode 100644 index 0000000000000..9caafe7676f4e --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-coro.cpp @@ -0,0 +1,711 @@ +// RUN: %check_clang_tidy -std=c++20 %s bugprone-exception-escape %t -- \ +// RUN: -- -fexceptions + +namespace std { + +template struct coroutine_traits { + using promise_type = typename Ret::promise_type; +}; + +template struct coroutine_handle { + static coroutine_handle from_address(void *) noexcept; + static coroutine_handle from_promise(Promise &promise); + constexpr void *address() const noexcept; +}; + +template <> struct coroutine_handle { + template + coroutine_handle(coroutine_handle) noexcept; + static coroutine_handle from_address(void *); + constexpr void *address() const noexcept; +}; + +struct suspend_always { + bool await_ready() noexcept { return false; } + void await_suspend(coroutine_handle<>) noexcept {} + void await_resume() noexcept {} +}; + +struct suspend_never { + bool await_ready() noexcept { return true; } + void await_suspend(coroutine_handle<>) noexcept {} + void await_resume() noexcept {} +}; + +} // namespace std + +template +struct Promise; + +template < + typename T, bool ThrowInTaskConstructor = false, + bool ThrowInPromiseConstructor = false, bool ThrowInInitialSuspend = false, + bool ThrowInGetReturnObject = false, bool ThrowInUnhandledException = false> +struct Task { + using promise_type = + Promise; + + explicit Task(promise_type &p) { + if constexpr (ThrowInTaskConstructor) { + throw 1; + } + + p.return_val = this; + } + + bool await_ready() { return true; } + + void await_suspend(std::coroutine_handle<> h) {} + + void await_resume() {} + + T value; +}; + +template +struct Task { + using promise_type = + Promise; + + explicit Task(promise_type &p) { + if constexpr (ThrowInTaskConstructor) { + throw 1; + } + + p.return_val = this; + } + + bool await_ready() { return true; } + + void await_suspend(std::coroutine_handle<> h) {} + + void await_resume() {} +}; + +template +struct Promise { + Promise() { + if constexpr (ThrowInPromiseConstructor) { + throw 1; + } + } + + Task get_return_object() { + if constexpr (ThrowInGetReturnObject) { + throw 1; + } + + return Task{*this}; + } + + std::suspend_never initial_suspend() const { + if constexpr (ThrowInInitialSuspend) { + throw 1; + } + + return {}; + } + + std::suspend_never final_suspend() const noexcept { return {}; } + + template void return_value(U &&val) { + return_val->value = static_cast(val); + } + + template std::suspend_never yield_value(U &&val) { + return_val->value = static_cast(val); + return {}; + } + + void unhandled_exception() { + if constexpr (ThrowInUnhandledException) { + throw 1; + } + } + + Task *return_val; +}; + +template +struct Promise { + Promise() { + if constexpr (ThrowInPromiseConstructor) { + throw 1; + } + } + + Task get_return_object() { + if constexpr (ThrowInGetReturnObject) { + throw 1; + } + + return Task{*this}; + } + + std::suspend_never initial_suspend() const { + if constexpr (ThrowInInitialSuspend) { + throw 1; + } + + return {}; + } + + std::suspend_never final_suspend() const noexcept { return {}; } + + void return_void() {} + + void unhandled_exception() { + if constexpr (ThrowInUnhandledException) { + throw 1; + } + } + + Task *return_val; +}; + +struct Evil { + ~Evil() noexcept(false) { + 
// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: an exception may be thrown in function '~Evil' which should not throw exceptions + throw 42; + } +}; + +Task returnOne() { co_return 1; } + +namespace function { + +namespace coreturn { + +Task a_ShouldNotDiag(const int a, const int b) { + if (b == 0) + throw b; + + co_return a / b; +} + +Task b_ShouldNotDiag(const int a, const int b) noexcept { + if (b == 0) + throw b; + + co_return a / b; +} + +Task c_ShouldNotDiag(const int a, const int b) { + if (b == 0) + throw Evil{}; + + co_return a / b; +} + +Task c_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: an exception may be thrown in function 'c_ShouldDiag' which should not throw exceptions + if (b == 0) + throw Evil{}; + + co_return a / b; +} + +Task d_ShouldNotDiag(const int a, const int b) { + co_return a / b; +} + +Task d_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: an exception may be thrown in function 'd_ShouldDiag' which should not throw exceptions + co_return a / b; +} + +Task e_ShouldNotDiag(const int a, const int b) { + co_return a / b; +} + +Task e_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: an exception may be thrown in function 'e_ShouldDiag' which should not throw exceptions + co_return a / b; +} + +Task f_ShouldNotDiag(const int a, const int b) { + co_return a / b; +} + +Task f_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: an exception may be thrown in function 'f_ShouldDiag' which should not throw exceptions + co_return a / b; +} + +Task g_ShouldNotDiag(const int a, const int b) { + co_return a / b; +} + +Task g_ShouldDiag(const int a, + const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-2]]:38: warning: an exception may be thrown in function 'g_ShouldDiag' which should not throw exceptions + co_return a / b; +} + +Task h_ShouldNotDiag(const int a, + const int b) { + co_return a / b; +} + +Task h_ShouldDiag(const int a, + const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-2]]:45: warning: an exception may be thrown in function 'h_ShouldDiag' which should not throw exceptions + co_return a / b; +} + +} // namespace coreturn + +namespace coyield { + +Task a_ShouldNotDiag(const int a, const int b) { + if (b == 0) + throw b; + + co_yield a / b; +} + +Task b_ShouldNotDiag(const int a, const int b) noexcept { + if (b == 0) + throw b; + + co_yield a / b; +} + +Task c_ShouldNotDiag(const int a, const int b) { + if (b == 0) + throw Evil{}; + + co_yield a / b; +} + +Task c_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: an exception may be thrown in function 'c_ShouldDiag' which should not throw exceptions + if (b == 0) + throw Evil{}; + + co_yield a / b; +} + +Task d_ShouldNotDiag(const int a, const int b) { + co_yield a / b; +} + +Task d_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: an exception may be thrown in function 'd_ShouldDiag' which should not throw exceptions + co_yield a / b; +} + +Task e_ShouldNotDiag(const int a, const int b) { + co_yield a / b; +} + +Task e_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: an exception may be thrown in function 'e_ShouldDiag' which should not throw exceptions + co_yield a / b; +} + +Task f_ShouldNotDiag(const int a, const int b) { + co_yield a / b; +} + +Task f_ShouldDiag(const int a, const int b) 
noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: an exception may be thrown in function 'f_ShouldDiag' which should not throw exceptions + co_yield a / b; +} + +Task g_ShouldNotDiag(const int a, const int b) { + co_yield a / b; +} + +Task g_ShouldDiag(const int a, + const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-2]]:38: warning: an exception may be thrown in function 'g_ShouldDiag' which should not throw exceptions + co_yield a / b; +} + +Task h_ShouldNotDiag(const int a, + const int b) { + co_yield a / b; +} + +Task h_ShouldDiag(const int a, + const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-2]]:45: warning: an exception may be thrown in function 'h_ShouldDiag' which should not throw exceptions + co_yield a / b; +} + +} // namespace coyield + +namespace coawait { + +Task a_ShouldNotDiag(const int a, const int b) { + if (b == 0) + throw b; + + co_await returnOne(); +} + +Task b_ShouldNotDiag(const int a, const int b) noexcept { + if (b == 0) + throw b; + + co_await returnOne(); +} + +Task c_ShouldNotDiag(const int a, const int b) { + if (b == 0) + throw Evil{}; + + co_await returnOne(); +} + +Task c_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: an exception may be thrown in function 'c_ShouldDiag' which should not throw exceptions + if (b == 0) + throw Evil{}; + + co_await returnOne(); +} + +Task d_ShouldNotDiag(const int a, const int b) { + co_await returnOne(); +} + +Task d_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: an exception may be thrown in function 'd_ShouldDiag' which should not throw exceptions + co_await returnOne(); +} + +Task e_ShouldNotDiag(const int a, const int b) { + co_await returnOne(); +} + +Task e_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: an exception may be thrown in function 'e_ShouldDiag' which should not throw exceptions + co_await returnOne(); +} + +Task f_ShouldNotDiag(const int a, const int b) { + co_await returnOne(); +} + +Task f_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:32: warning: an exception may be thrown in function 'f_ShouldDiag' which should not throw exceptions + co_await returnOne(); +} + +Task g_ShouldNotDiag(const int a, + const int b) { + co_await returnOne(); +} + +Task g_ShouldDiag(const int a, + const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-2]]:39: warning: an exception may be thrown in function 'g_ShouldDiag' which should not throw exceptions + co_await returnOne(); +} + +Task h_ShouldNotDiag(const int a, + const int b) { + co_await returnOne(); +} + +Task +h_ShouldDiag(const int a, const int b) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:1: warning: an exception may be thrown in function 'h_ShouldDiag' which should not throw exceptions + co_await returnOne(); +} + +} // namespace coawait + +} // namespace function + +namespace lambda { + +namespace coreturn { + +const auto a_ShouldNotDiag = [](const int a, const int b) -> Task { + if (b == 0) + throw b; + + co_return a / b; +}; + +const auto b_ShouldNotDiag = [](const int a, + const int b) noexcept -> Task { + if (b == 0) + throw b; + + co_return a / b; +}; + +const auto c_ShouldNotDiag = [](const int a, const int b) -> Task { + if (b == 0) + throw Evil{}; + + co_return a / b; +}; + +const auto c_ShouldDiag = [](const int a, const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: an exception may be thrown in function 'operator()' which should not 
throw exceptions + if (b == 0) + throw Evil{}; + + co_return a / b; +}; + +const auto d_ShouldNotDiag = [](const int a, const int b) -> Task { + co_return a / b; +}; + +const auto d_ShouldDiag = [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_return a / b; +}; + +const auto e_ShouldNotDiag = [](const int a, + const int b) -> Task { + co_return a / b; +}; + +const auto e_ShouldDiag = [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_return a / b; +}; + +const auto f_ShouldNotDiag = [](const int a, + const int b) -> Task { + co_return a / b; +}; + +const auto f_ShouldDiag = + [](const int a, const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_return a / b; +}; + +const auto g_ShouldNotDiag = + [](const int a, const int b) -> Task { + co_return a / b; +}; + +const auto g_ShouldDiag = + [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_return a / b; +}; + +const auto h_ShouldNotDiag = + [](const int a, + const int b) -> Task { + co_return a / b; +}; + +const auto h_ShouldDiag = + [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_return a / b; +}; + +} // namespace coreturn + +namespace coyield { + +const auto a_ShouldNotDiag = [](const int a, const int b) -> Task { + if (b == 0) + throw b; + + co_yield a / b; +}; + +const auto b_ShouldNotDiag = [](const int a, + const int b) noexcept -> Task { + if (b == 0) + throw b; + + co_yield a / b; +}; + +const auto c_ShouldNotDiag = [](const int a, const int b) -> Task { + if (b == 0) + throw Evil{}; + + co_yield a / b; +}; + +const auto c_ShouldDiag = [](const int a, const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + if (b == 0) + throw Evil{}; + + co_yield a / b; +}; + +const auto d_ShouldNotDiag = [](const int a, const int b) -> Task { + co_yield a / b; +}; + +const auto d_ShouldDiag = [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_yield a / b; +}; + +const auto e_ShouldNotDiag = [](const int a, + const int b) -> Task { + co_yield a / b; +}; + +const auto e_ShouldDiag = [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_yield a / b; +}; + +const auto f_ShouldNotDiag = [](const int a, + const int b) -> Task { + co_yield a / b; +}; + +const auto f_ShouldDiag = + [](const int a, const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_yield a / b; +}; + +const auto g_ShouldNotDiag = + [](const int a, const int b) -> Task { + co_yield a / b; +}; + +const auto g_ShouldDiag = + [](const int a, + const 
int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_yield a / b; +}; + +const auto h_ShouldNotDiag = + [](const int a, + const int b) -> Task { + co_yield a / b; +}; + +const auto h_ShouldDiag = + [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_yield a / b; +}; + +} // namespace coyield + +namespace coawait { + +const auto a_ShouldNotDiag = [](const int a, const int b) -> Task { + if (b == 0) + throw b; + + co_await returnOne(); +}; + +const auto b_ShouldNotDiag = [](const int a, + const int b) noexcept -> Task { + if (b == 0) + throw b; + + co_await returnOne(); +}; + +const auto c_ShouldNotDiag = [](const int a, const int b) -> Task { + if (b == 0) + throw Evil{}; + + co_await returnOne(); +}; + +const auto c_ShouldDiag = [](const int a, const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + if (b == 0) + throw Evil{}; + + co_await returnOne(); +}; + +const auto d_ShouldNotDiag = [](const int a, const int b) -> Task { + co_await returnOne(); +}; + +const auto d_ShouldDiag = [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_await returnOne(); +}; + +const auto e_ShouldNotDiag = [](const int a, + const int b) -> Task { + co_await returnOne(); +}; + +const auto e_ShouldDiag = [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_await returnOne(); +}; + +const auto f_ShouldNotDiag = [](const int a, + const int b) -> Task { + co_await returnOne(); +}; + +const auto f_ShouldDiag = + [](const int a, const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_await returnOne(); +}; + +const auto g_ShouldNotDiag = + [](const int a, const int b) -> Task { + co_await returnOne(); +}; + +const auto g_ShouldDiag = + [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_await returnOne(); +}; + +const auto h_ShouldNotDiag = + [](const int a, + const int b) -> Task { + co_await returnOne(); +}; + +const auto h_ShouldDiag = + [](const int a, + const int b) noexcept -> Task { + // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: an exception may be thrown in function 'operator()' which should not throw exceptions + co_await returnOne(); +}; + +} // namespace coawait + +} // namespace lambda diff --git a/clang/include/clang/AST/StmtCXX.h b/clang/include/clang/AST/StmtCXX.h index 60fc3f3a63f49..8b4ef24ed376a 100644 --- a/clang/include/clang/AST/StmtCXX.h +++ b/clang/include/clang/AST/StmtCXX.h @@ -443,6 +443,17 @@ class CoroutineBodyStmt final NumParams); } + child_range childrenExclBody() { + return child_range(getStoredStmts() + SubStmt::Body + 1, + getStoredStmts() + SubStmt::FirstParamMove + NumParams); + } + + const_child_range childrenExclBody() const { + return const_child_range(getStoredStmts() + SubStmt::Body + 1, + getStoredStmts() + 
SubStmt::FirstParamMove + + NumParams); + } + static bool classof(const Stmt *T) { return T->getStmtClass() == CoroutineBodyStmtClass; } From b07d08bb8590b2689f6dc5fbea1ab32b703fcff6 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 30 May 2023 10:49:28 -0700 Subject: [PATCH 119/704] [RISCV] Add additional vslide1up test coverage Add another form of the same pattern (as_rotate tests), and add coverage for a couple corner cases I got wrong at first in an upcoming rewrite. --- .../rvv/fixed-vector-shuffle-vslide1up.ll | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll index 21fb38643bf2c..19154c6a7a70d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-shuffle-vslide1up.ll @@ -304,6 +304,35 @@ define <4 x i8> @vslide1up_4xi8_inverted(<4 x i8> %v, i8 %b) { ret <4 x i8> %v2 } +define <2 x double> @vslide1up_2xf64_as_rotate(<2 x double> %v, double %b) { +; CHECK-LABEL: vslide1up_2xf64_as_rotate: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %v1 = insertelement <2 x double> %v, double %b, i64 1 + %v2 = shufflevector <2 x double> %v1, <2 x double> poison, <2 x i32> + ret <2 x double> %v2 +} + +define <4 x i8> @vslide1up_4xi8_as_rotate(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1up_4xi8_as_rotate: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 3 +; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %v1 = insertelement <4 x i8> %v, i8 %b, i64 3 + %v2 = shufflevector <4 x i8> %v1, <4 x i8> poison, <4 x i32> + ret <4 x i8> %v2 +} ; The length of the shift is less than the suffix, since we'd have to ; materailize the splat, using the vslide1up doesn't help us. 
@@ -333,3 +362,55 @@ define <4 x i32> @vslide1up_4xi32_neg2(<4 x i32> %v1, <4 x i32> %v2) { %res = shufflevector <4 x i32> %v1, <4 x i32> %v2, <4 x i32> ret <4 x i32> %res } + +; Not profitable - can just use a slideup instead +define <4 x i8> @vslide1up_4xi8_neg_undef_insert(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1up_4xi8_neg_undef_insert: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %v2 = shufflevector <4 x i8> poison, <4 x i8> %v, <4 x i32> + ret <4 x i8> %v2 +} + +define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1up_4xi8_neg_incorrect_insert: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI23_0) +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %v2 = shufflevector <4 x i8> poison, <4 x i8> %v, <4 x i32> + ret <4 x i8> %v2 +} + +define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert2(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1up_4xi8_neg_incorrect_insert2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %v2 = shufflevector <4 x i8> poison, <4 x i8> %v, <4 x i32> + ret <4 x i8> %v2 +} + +define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert3(<4 x i8> %v, i8 %b) { +; CHECK-LABEL: vslide1up_4xi8_neg_incorrect_insert3: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI25_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI25_0) +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %v2 = shufflevector <4 x i8> poison, <4 x i8> %v, <4 x i32> + ret <4 x i8> %v2 +} From b75f9ce3fe861473e36ea6715d82b6954ea2b815 Mon Sep 17 00:00:00 2001 From: Mats Petersson Date: Mon, 22 May 2023 15:48:01 +0100 Subject: [PATCH 120/704] [FLANG] Support all arrays for LoopVersioning This patch makes more than 2D arrays work, with a fix for the way that loop index is calculated. Removing the restriction of number of dimensions. This also changes the way that the actual index is calculated, such that the stride is used rather than the extent of the previous dimension. Some tests failed without fixing this - this was likely a latent bug in the 2D version too, but found in a test using 3D arrays, so wouldn't have been found with 2D only. This introduces a division on the index calculation - however it should be a nice and constant value allowing a shift to be used to actually divide - or otherwise removed by using other methods to calculate the result. In analysing code generated with optimisation at -O3, there are no divides produced. Some minor refactoring to avoid repeatedly asking for the "rank" of the array being worked on. This improves some of the SPEC-2017 ROMS code, in the same way as the limited 2D array improvements - less overhead spent calculating array indices in the inner-most loop and better use of vector-instructions. 
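To make the new index computation concrete, here is a minimal C++ sketch of the flattening emitted for the unit-stride branch of a rank-3 array (the names flattenIndex and byteStride are illustrative only; the strides stand in for the byte distances reported by fir.box_dims, and the division by the element size is the one discussed above):

  #include <cassert>
  #include <cstddef>

  // arr(x, y, z) -> flat[(y*byteStride1 + z*byteStride2)/elemSize + x]
  // byteStrideN is the distance in bytes between consecutive elements
  // along dimension N; the innermost dimension is unit stride, so x needs
  // no scaling.
  std::size_t flattenIndex(std::size_t x, std::size_t y, std::size_t z,
                           std::size_t byteStride1, std::size_t byteStride2,
                           std::size_t elemSize) {
    std::size_t scaled = y * byteStride1 + z * byteStride2;
    // The divisor is a loop-invariant constant (the element size), so the
    // optimizer reduces the division to a shift or removes it entirely.
    return scaled / elemSize + x;
  }

  int main() {
    // A contiguous 4 x 3 x 2 array of doubles: byte strides are 32 and 96.
    const std::size_t elemSize = 8, s1 = 4 * 8, s2 = 4 * 3 * 8;
    // Element (x=2, y=1, z=1) lands at 2 + 1*4 + 1*12 = 18.
    assert(flattenIndex(2, 1, 1, s1, s2, elemSize) == 18);
    return 0;
  }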
Reviewed By: kiranchandramohan Differential Revision: https://reviews.llvm.org/D151140 --- .../Optimizer/Transforms/LoopVersioning.cpp | 69 ++++---- flang/test/Transforms/loop-versioning.fir | 150 +++++++++++++++++- 2 files changed, 182 insertions(+), 37 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp index 4695d39ee41cb..f1588d2f622a1 100644 --- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp +++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp @@ -73,7 +73,6 @@ namespace { class LoopVersioningPass : public fir::impl::LoopVersioningBase { - public: void runOnOperation() override; }; @@ -105,6 +104,7 @@ void LoopVersioningPass::runOnOperation() { struct ArgInfo { mlir::Value *arg; size_t size; + unsigned rank; fir::BoxDimsOp dims[CFI_MAX_RANK]; }; @@ -114,13 +114,11 @@ void LoopVersioningPass::runOnOperation() { mlir::Block::BlockArgListType args = func.getArguments(); mlir::ModuleOp module = func->getParentOfType(); fir::KindMapping kindMap = fir::getKindMapping(module); - mlir::SmallVector argsOfInterest; + mlir::SmallVector argsOfInterest; for (auto &arg : args) { if (auto seqTy = getAsSequenceType(&arg)) { unsigned rank = seqTy.getDimension(); - // Currently limited to 1D or 2D arrays as that seems to give good - // improvement without excessive increase in code-size, etc. - if (rank > 0 && rank < 3 && + if (rank > 0 && seqTy.getShape()[0] == fir::SequenceType::getUnknownExtent()) { size_t typeSize = 0; mlir::Type elementType = fir::unwrapSeqOrBoxedSeqType(arg.getType()); @@ -130,12 +128,9 @@ void LoopVersioningPass::runOnOperation() { else if (auto cty = elementType.dyn_cast()) typeSize = 2 * cty.getEleType(kindMap).getIntOrFloatBitWidth() / 8; if (typeSize) - argsOfInterest.push_back({&arg, typeSize, {}}); + argsOfInterest.push_back({&arg, typeSize, rank, {}}); else LLVM_DEBUG(llvm::dbgs() << "Type not supported\n"); - - } else { - LLVM_DEBUG(llvm::dbgs() << "Too many dimensions\n"); } } } @@ -145,14 +140,14 @@ void LoopVersioningPass::runOnOperation() { struct OpsWithArgs { mlir::Operation *op; - mlir::SmallVector argsAndDims; + mlir::SmallVector argsAndDims; }; // Now see if those arguments are used inside any loop. mlir::SmallVector loopsOfInterest; func.walk([&](fir::DoLoopOp loop) { mlir::Block &body = *loop.getBody(); - mlir::SmallVector argsInLoop; + mlir::SmallVector argsInLoop; body.walk([&](fir::CoordinateOp op) { // The current operation could be inside another loop than // the one we're currently processing. Skip it, we'll get @@ -199,16 +194,16 @@ void LoopVersioningPass::runOnOperation() { mlir::Value allCompares = nullptr; // Ensure all of the arrays are unit-stride. for (auto &arg : op.argsAndDims) { - - fir::SequenceType seqTy = getAsSequenceType(arg.arg); - unsigned rank = seqTy.getDimension(); - - // We only care about lowest order dimension. - for (unsigned i = 0; i < rank; i++) { + // Fetch all the dimensions of the array, except the last dimension. + // Always fetch the first dimension, however, so set ndims = 1 if + // we have one dim + unsigned ndims = arg.rank; + for (unsigned i = 0; i < ndims; i++) { mlir::Value dimIdx = builder.createIntegerConstant(loc, idxTy, i); arg.dims[i] = builder.create(loc, idxTy, idxTy, idxTy, *arg.arg, dimIdx); } + // We only care about lowest order dimension, here. 
mlir::Value elemSize = builder.createIntegerConstant(loc, idxTy, arg.size); mlir::Value cmp = builder.create( @@ -245,25 +240,41 @@ void LoopVersioningPass::runOnOperation() { // Reduce the multi-dimensioned index to a single index. // This is required becase fir arrays do not support multiple dimensions // with unknown dimensions at compile time. + // We then calculate the multidimensional array like this: + // arr(x, y, z) bedcomes arr(z * stride(2) + y * stride(1) + x) + // where stride is the distance between elements in the dimensions + // 0, 1 and 2 or x, y and z. if (coop->getOperand(0) == *arg.arg && coop->getOperands().size() >= 2) { builder.setInsertionPoint(coop); - mlir::Value totalIndex = builder.createIntegerConstant(loc, idxTy, 0); - // Operand(1) = array; Operand(2) = index1; Operand(3) = index2 - for (unsigned i = coop->getOperands().size() - 1; i > 1; i--) { + mlir::Value totalIndex; + for (unsigned i = arg.rank - 1; i > 0; i--) { + // Operand(1) = array; Operand(2) = index1; Operand(3) = index2 mlir::Value curIndex = - builder.createConvert(loc, idxTy, coop->getOperand(i)); - // First arg is Operand2, so dims[i-2] is 0-based i-1! + builder.createConvert(loc, idxTy, coop->getOperand(i + 1)); + // Multiply by the stride of this array. Later we'll divide by the + // element size. mlir::Value scale = - builder.createConvert(loc, idxTy, arg.dims[i - 2].getResult(1)); + builder.createConvert(loc, idxTy, arg.dims[i].getResult(2)); + curIndex = + builder.create(loc, scale, curIndex); + totalIndex = (totalIndex) ? builder.create( + loc, curIndex, totalIndex) + : curIndex; + } + mlir::Value elemSize = + builder.createIntegerConstant(loc, idxTy, arg.size); + // This is the lowest dimension - which doesn't need scaling + mlir::Value finalIndex = + builder.createConvert(loc, idxTy, coop->getOperand(1)); + if (totalIndex) { totalIndex = builder.create( - loc, totalIndex, - builder.create(loc, scale, curIndex)); + loc, + builder.create(loc, totalIndex, elemSize), + finalIndex); + } else { + totalIndex = finalIndex; } - totalIndex = builder.create( - loc, totalIndex, - builder.createConvert(loc, idxTy, coop->getOperand(1))); - auto newOp = builder.create( loc, builder.getRefType(elementType), caddr, mlir::ValueRange{totalIndex}); diff --git a/flang/test/Transforms/loop-versioning.fir b/flang/test/Transforms/loop-versioning.fir index bad02b7a226c5..3c8930ccbde47 100644 --- a/flang/test/Transforms/loop-versioning.fir +++ b/flang/test/Transforms/loop-versioning.fir @@ -156,8 +156,7 @@ func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, // CHECK: %[[CONV:.*]] = fir.convert %[[Y]] : {{.*}} // CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]] : {{.*}} // CHECK: fir.do_loop %[[INDEX:.*]] = {{.*}} -// CHECK: %[[IND_PLUS_1:.*]] = arith.addi %{{.*}}, %[[INDEX]] -// CHECK: %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[IND_PLUS_1]] +// CHECK: %[[YADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %[[INDEX]] // CHECK: %[[YINT:.*]] = fir.load %[[YADDR]] : {{.*}} // CHECK: %[[YINDEX:.*]] = fir.convert %[[YINT]] // CHECK: %[[XADDR:.*]] = fir.array_coor %[[X]] [%{{.*}}] %[[YINDEX]] @@ -269,7 +268,7 @@ func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, // CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[CONV]] // CHECK: %[[RES:.*]] = fir.do_loop {{.*}} { // CHECK: %[[ADDR:.*]] = fir.coordinate_of %[[BOX_ADDR]], %{{.*}} -// CHECK: %45 = fir.load %[[ADDR]] : !fir.ref +// CHECK: %{{.*}} = fir.load %[[ADDR]] : !fir.ref // CHECK: } // CHECK: fir.result %[[RES]] : {{.*}} // CHECK: } else { @@ 
-355,19 +354,22 @@ func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, // Only inner loop should be verisoned. // CHECK: fir.do_loop // CHECK: %[[ZERO:.*]] = arith.constant 0 : index -// CHECK: %[[DIMS:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}} +// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}} +// CHECK: %[[ONE:.*]] = arith.constant 1 : index +// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}} // CHECK: %[[SIZE:.*]] = arith.constant 8 : index -// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS]]#2, %[[SIZE]] +// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]] // CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}} // CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]] // CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref> // CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}} // Check the 2D -> 1D coordinate conversion, should have a multiply and a final add. // Some other operations are checked to synch the different parts. -// CHECK: arith.muli %[[DIMS]]#1, {{.*}} -// CHECK: %[[OUTER_IDX:.*]] = arith.addi {{.*}} +// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}} +// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index // CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}} -// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_IDX]], %[[INNER_IDX]] +// CHECK: %[[OUTER_DIV:.*]] = arith.divsi %[[OUTER_IDX]], %[[ITEMSIZE]] +// CHECK: %[[C2D:.*]] = arith.addi %[[OUTER_DIV]], %[[INNER_IDX]] // CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C2D]] : (!fir.ref>, index) -> !fir.ref // CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref // CHECK: fir.result %{{.*}}, %{{.*}} @@ -384,4 +386,136 @@ func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, // CHECK: fir.store %[[IF_RES]]#1 to %{{.*}} // CHECK: return +// ----- + +// subroutine sum3d(a, nx, ny, nz) +// real*8 :: a(:, :, :) +// integer :: nx, ny, nz +// real*8 :: sum +// integer :: i, j, k +// sum = 0 +// do k=1,nz +// do j=1,ny +// do i=0,nx +// sum = sum + a(i, j, k) +// end do +// end do +// end do +// end subroutine sum3d + + + func.func @sum3d(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "nx"}, %arg2: !fir.ref {fir.bindc_name = "ny"}, %arg3: !fir.ref {fir.bindc_name = "nz"}) { + %0 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmoduleFsum3dEi"} + %1 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmoduleFsum3dEj"} + %2 = fir.alloca i32 {bindc_name = "k", uniq_name = "_QMmoduleFsum3dEk"} + %3 = fir.alloca f64 {bindc_name = "sum", uniq_name = "_QMmoduleFsum3dEsum"} + %cst = arith.constant 0.000000e+00 : f64 + fir.store %cst to %3 : !fir.ref + %c1_i32 = arith.constant 1 : i32 + %4 = fir.convert %c1_i32 : (i32) -> index + %5 = fir.load %arg3 : !fir.ref + %6 = fir.convert %5 : (i32) -> index + %c1 = arith.constant 1 : index + %7 = fir.convert %4 : (index) -> i32 + %8:2 = fir.do_loop %arg4 = %4 to %6 step %c1 iter_args(%arg5 = %7) -> (index, i32) { + fir.store %arg5 to %2 : !fir.ref + %c1_i32_0 = arith.constant 1 : i32 + %9 = fir.convert %c1_i32_0 : (i32) -> index + %10 = fir.load %arg2 : !fir.ref + %11 = fir.convert %10 : (i32) -> index + %c1_1 = arith.constant 1 : index + %12 = fir.convert %9 : (index) -> i32 + %13:2 = fir.do_loop %arg6 = %9 to %11 step %c1_1 iter_args(%arg7 = %12) -> (index, i32) { + fir.store %arg7 to %1 : !fir.ref + %c0_i32 = arith.constant 0 : i32 + %18 = fir.convert %c0_i32 : (i32) -> index + %19 = fir.load %arg1 : !fir.ref + %20 = fir.convert %19 : (i32) -> index + %c1_2 = arith.constant 1 : 
index + %21 = fir.convert %18 : (index) -> i32 + %22:2 = fir.do_loop %arg8 = %18 to %20 step %c1_2 iter_args(%arg9 = %21) -> (index, i32) { + fir.store %arg9 to %0 : !fir.ref + %27 = fir.load %3 : !fir.ref + %28 = fir.load %0 : !fir.ref + %29 = fir.convert %28 : (i32) -> i64 + %c1_i64 = arith.constant 1 : i64 + %30 = arith.subi %29, %c1_i64 : i64 + %31 = fir.load %1 : !fir.ref + %32 = fir.convert %31 : (i32) -> i64 + %c1_i64_3 = arith.constant 1 : i64 + %33 = arith.subi %32, %c1_i64_3 : i64 + %34 = fir.load %2 : !fir.ref + %35 = fir.convert %34 : (i32) -> i64 + %c1_i64_4 = arith.constant 1 : i64 + %36 = arith.subi %35, %c1_i64_4 : i64 + %37 = fir.coordinate_of %arg0, %30, %33, %36 : (!fir.box>, i64, i64, i64) -> !fir.ref + %38 = fir.load %37 : !fir.ref + %39 = arith.addf %27, %38 fastmath : f64 + fir.store %39 to %3 : !fir.ref + %40 = arith.addi %arg8, %c1_2 : index + %41 = fir.convert %c1_2 : (index) -> i32 + %42 = fir.load %0 : !fir.ref + %43 = arith.addi %42, %41 : i32 + fir.result %40, %43 : index, i32 + } + fir.store %22#1 to %0 : !fir.ref + %23 = arith.addi %arg6, %c1_1 : index + %24 = fir.convert %c1_1 : (index) -> i32 + %25 = fir.load %1 : !fir.ref + %26 = arith.addi %25, %24 : i32 + fir.result %23, %26 : index, i32 + } + fir.store %13#1 to %1 : !fir.ref + %14 = arith.addi %arg4, %c1 : index + %15 = fir.convert %c1 : (index) -> i32 + %16 = fir.load %2 : !fir.ref + %17 = arith.addi %16, %15 : i32 + fir.result %14, %17 : index, i32 + } + fir.store %8#1 to %2 : !fir.ref + return + } + +// Note this only checks the expected transformation, not the entire generated code: +// CHECK-LABEL: func.func @sum3d( +// CHECK-SAME: %[[ARG0:.*]]: !fir.box> {{.*}}) +// Only inner loop should be verisoned. +// CHECK: fir.do_loop +// CHECK: %[[ZERO:.*]] = arith.constant 0 : index +// CHECK: %[[DIMS0:.*]]:3 = fir.box_dims %[[ARG0]], %[[ZERO]] : {{.*}} +// CHECK: %[[ONE:.*]] = arith.constant 1 : index +// CHECK: %[[DIMS1:.*]]:3 = fir.box_dims %[[ARG0]], %[[ONE]] : {{.*}} +// CHECK: %[[TWO:.*]] = arith.constant 2 : index +// CHECK: %[[DIMS2:.*]]:3 = fir.box_dims %[[ARG0]], %[[TWO]] : {{.*}} +// CHECK: %[[SIZE:.*]] = arith.constant 8 : index +// CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[DIMS0]]#2, %[[SIZE]] +// CHECK: %[[IF_RES:.*]]:2 = fir.if %[[CMP]] -> {{.*}} +// CHECK: %[[NEWARR:.*]] = fir.convert %[[ARG0]] +// CHECK: %[[BOXADDR:.*]] = fir.box_addr %[[NEWARR]] : {{.*}} -> !fir.ref> +// CHECK: %[[LOOP_RES:.*]]:2 = fir.do_loop {{.*}} +// Check the 3D -> 1D coordinate conversion, should have a multiply and a final add. +// Some other operations are checked to synch the different parts. 
+// CHECK: %[[OUTER_IDX:.*]] = arith.muli %[[DIMS2]]#2, {{.*}} +// CHECK: %[[MIDDLE_IDX:.*]] = arith.muli %[[DIMS1]]#2, {{.*}} +// CHECK: %[[MIDDLE_SUM:.*]] = arith.addi %[[MIDDLE_IDX]], %[[OUTER_IDX]] +// CHECK: %[[ITEMSIZE:.*]] = arith.constant 8 : index +// CHECK: %[[INNER_IDX:.*]] = fir.convert {{.*}} +// CHECK: %[[MIDDLE_DIV:.*]] = arith.divsi %[[MIDDLE_SUM]], %[[ITEMSIZE]] +// CHECK: %[[C3D:.*]] = arith.addi %[[MIDDLE_DIV]], %[[INNER_IDX]] +// CHECK: %[[COORD:.*]] = fir.coordinate_of %[[BOXADDR]], %[[C3D]] : (!fir.ref>, index) -> !fir.ref +// CHECK: %{{.*}} = fir.load %[[COORD]] : !fir.ref +// CHECK: fir.result %{{.*}}, %{{.*}} +// CHECK: } +// CHECK fir.result %[[LOOP_RES]]#0, %[[LOOP_RES]]#1 +// CHECK: } else { +// CHECK: %[[LOOP_RES2:.*]]:2 = fir.do_loop {{.*}} +// CHECK: %[[COORD2:.*]] = fir.coordinate_of %[[ARG0]], %{{.*}} : (!fir.box>, i64, i64, i64) -> !fir.ref +// CHECK: %{{.*}}= fir.load %[[COORD2]] : !fir.ref +// CHECK: fir.result %{{.*}}, %{{.*}} +// CHECK: } +// CHECK fir.result %[[LOOP_RES2]]#0, %[[LOOP_RES2]]#1 +// CHECK: } +// CHECK: fir.store %[[IF_RES]]#1 to %{{.*}} +// CHECK: return + } // End module From bd9940a809e3e355a640a9ec45b0c666faec33be Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 30 May 2023 10:58:06 -0700 Subject: [PATCH 121/704] [Driver][test] Properly test -mdefault-visibility-export-mapping= --- clang/test/CodeGen/mdefault-visibility-export-mapping.c | 4 ---- clang/test/Driver/mdefault-visibility-export-mapping.c | 7 +++++++ 2 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 clang/test/Driver/mdefault-visibility-export-mapping.c diff --git a/clang/test/CodeGen/mdefault-visibility-export-mapping.c b/clang/test/CodeGen/mdefault-visibility-export-mapping.c index 18c6302d9f655..7f39050907fff 100644 --- a/clang/test/CodeGen/mdefault-visibility-export-mapping.c +++ b/clang/test/CodeGen/mdefault-visibility-export-mapping.c @@ -9,10 +9,6 @@ // RUN: %clang -target powerpc-ibm-aix %s -mdefault-visibility-export-mapping=all -fvisibility=hidden -S -emit-llvm -o - | \ // RUN: FileCheck -check-prefixes=UNSPECIFIED-HID,EXPLICIT-EXP %s -// RUN: not %clang -mdefault-visibility-export-mapping=explicit -target powerpc-unknown-linux %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ERROR %s -// ERROR: unsupported option '-mdefault-visibility-export-mapping=explicit' for target 'powerpc-unknown-linux' - // UNSPECIFIED-DEF: define void @func() // UNSPECIFIED-HID: define hidden void @func() // UNSPECIFIED-EXP: define dllexport void @func() diff --git a/clang/test/Driver/mdefault-visibility-export-mapping.c b/clang/test/Driver/mdefault-visibility-export-mapping.c new file mode 100644 index 0000000000000..506149b897e04 --- /dev/null +++ b/clang/test/Driver/mdefault-visibility-export-mapping.c @@ -0,0 +1,7 @@ +// RUN: %clang -### -S -mdefault-visibility-export-mapping=explicit --target=powerpc-ibm-aix %s 2>&1 | FileCheck %s +// RUN: %clang -### -S -mdefault-visibility-export-mapping=explicit --target=powerpc-unknown-linux %s 2>&1 | \ +// RUN: FileCheck -check-prefix=ERROR %s + +// CHECK: "-mdefault-visibility-export-mapping=explicit" + +// ERROR: unsupported option '-mdefault-visibility-export-mapping=explicit' for target 'powerpc-unknown-linux' From 06ff9770477d8c7378047b0171db4b25eba5d8dd Mon Sep 17 00:00:00 2001 From: Erick Velez Date: Tue, 30 May 2023 18:58:13 +0100 Subject: [PATCH 122/704] [clang][ExtractAPI] Refactor serializer to the CRTP Refactor SerializerBase and SymbolGraphSerializer to use a visitor pattern described by the CRTP. 
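For readers unfamiliar with the pattern, a minimal standalone sketch of the CRTP visitor shape adopted here (class and hook names are generic placeholders, not the actual ExtractAPI interfaces): the base class drives traversal and dispatches statically to the derived class, which shadows only the hooks it cares about, with no virtual calls involved.

  #include <iostream>

  template <typename Derived> class VisitorBase {
  public:
    void traverse() {
      // Static dispatch: calls resolve against Derived at compile time.
      derived()->visitFunctionRecord();
      derived()->visitEnumRecord();
    }
    // Default no-op hooks the derived class may shadow.
    void visitFunctionRecord() {}
    void visitEnumRecord() {}

  protected:
    Derived *derived() { return static_cast<Derived *>(this); }
  };

  class PrintingVisitor : public VisitorBase<PrintingVisitor> {
  public:
    void visitFunctionRecord() { std::cout << "function\n"; }
    // visitEnumRecord() is inherited as a no-op.
  };

  int main() {
    PrintingVisitor V;
    V.traverse(); // prints "function"
    return 0;
  }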
Reviewed By: dang Differential Revision: https://reviews.llvm.org/D151477 --- .../ExtractAPI/Serialization/SerializerBase.h | 118 +++++++++++++----- .../Serialization/SymbolGraphSerializer.h | 68 +++++----- clang/lib/ExtractAPI/CMakeLists.txt | 1 - .../Serialization/SerializerBase.cpp | 19 --- .../Serialization/SymbolGraphSerializer.cpp | 77 +++--------- 5 files changed, 149 insertions(+), 134 deletions(-) delete mode 100644 clang/lib/ExtractAPI/Serialization/SerializerBase.cpp diff --git a/clang/include/clang/ExtractAPI/Serialization/SerializerBase.h b/clang/include/clang/ExtractAPI/Serialization/SerializerBase.h index d8aa826e3f4f6..006e92be29555 100644 --- a/clang/include/clang/ExtractAPI/Serialization/SerializerBase.h +++ b/clang/include/clang/ExtractAPI/Serialization/SerializerBase.h @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines the ExtractAPI APISerializer interface. +/// This file defines the ExtractAPI APISetVisitor interface. /// //===----------------------------------------------------------------------===// @@ -15,47 +15,107 @@ #define LLVM_CLANG_EXTRACTAPI_SERIALIZATION_SERIALIZERBASE_H #include "clang/ExtractAPI/API.h" -#include "clang/ExtractAPI/APIIgnoresList.h" -#include "llvm/Support/raw_ostream.h" namespace clang { namespace extractapi { -/// Common options to customize the serializer output. -struct APISerializerOption { - /// Do not include unnecessary whitespaces to save space. - bool Compact; -}; - -/// The base interface of serializers for API information. -class APISerializer { +/// The base interface of visitors for API information. +template class APISetVisitor { public: - /// Serialize the API information to \p os. - virtual void serialize(raw_ostream &os) = 0; + void traverseAPISet() { + getDerived()->traverseGlobalVariableRecords(); -protected: - const APISet &API; + getDerived()->traverseGlobalFunctionRecords(); + + getDerived()->traverseEnumRecords(); + + getDerived()->traverseStructRecords(); + + getDerived()->traverseObjCInterfaces(); + + getDerived()->traverseObjCProtocols(); + + getDerived()->traverseMacroDefinitionRecords(); + + getDerived()->traverseTypedefRecords(); + } + + void traverseGlobalFunctionRecords() { + for (const auto &GlobalFunction : API.getGlobalFunctions()) + getDerived()->visitGlobalFunctionRecord(*GlobalFunction.second); + } + + void traverseGlobalVariableRecords() { + for (const auto &GlobalVariable : API.getGlobalVariables()) + getDerived()->visitGlobalVariableRecord(*GlobalVariable.second); + } + + void traverseEnumRecords() { + for (const auto &Enum : API.getEnums()) + getDerived()->visitEnumRecord(*Enum.second); + } - /// The list of symbols to ignore. - /// - /// Note: This should be consulted before emitting a symbol. 
- const APIIgnoresList &IgnoresList; + void traverseStructRecords() { + for (const auto &Struct : API.getStructs()) + getDerived()->visitStructRecord(*Struct.second); + } - APISerializerOption Options; + void traverseObjCInterfaces() { + for (const auto &Interface : API.getObjCInterfaces()) + getDerived()->visitObjCContainerRecord(*Interface.second); + } + + void traverseObjCProtocols() { + for (const auto &Protocol : API.getObjCProtocols()) + getDerived()->visitObjCContainerRecord(*Protocol.second); + } + + void traverseMacroDefinitionRecords() { + for (const auto &Macro : API.getMacros()) + getDerived()->visitMacroDefinitionRecord(*Macro.second); + } + + void traverseTypedefRecords() { + for (const auto &Typedef : API.getTypedefs()) + getDerived()->visitTypedefRecord(*Typedef.second); + } + + /// Visit a global function record. + void visitGlobalFunctionRecord(const GlobalFunctionRecord &Record){}; + + /// Visit a global variable record. + void visitGlobalVariableRecord(const GlobalVariableRecord &Record){}; + + /// Visit an enum record. + void visitEnumRecord(const EnumRecord &Record){}; + + /// Visit a struct record. + void visitStructRecord(const StructRecord &Record){}; + + /// Visit an Objective-C container record. + void visitObjCContainerRecord(const ObjCContainerRecord &Record){}; + + /// Visit a macro definition record. + void visitMacroDefinitionRecord(const MacroDefinitionRecord &Record){}; + + /// Visit a typedef record. + void visitTypedefRecord(const TypedefRecord &Record){}; + +protected: + const APISet &API; public: - APISerializer() = delete; - APISerializer(const APISerializer &) = delete; - APISerializer(APISerializer &&) = delete; - APISerializer &operator=(const APISerializer &) = delete; - APISerializer &operator=(APISerializer &&) = delete; + APISetVisitor() = delete; + APISetVisitor(const APISetVisitor &) = delete; + APISetVisitor(APISetVisitor &&) = delete; + APISetVisitor &operator=(const APISetVisitor &) = delete; + APISetVisitor &operator=(APISetVisitor &&) = delete; protected: - APISerializer(const APISet &API, const APIIgnoresList &IgnoresList, - APISerializerOption Options = {}) - : API(API), IgnoresList(IgnoresList), Options(Options) {} + APISetVisitor(const APISet &API) : API(API) {} + ~APISetVisitor() = default; - virtual ~APISerializer() = default; + Derived *getDerived() { return static_cast(this); }; }; } // namespace extractapi diff --git a/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h b/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h index 6639082bbf332..e77903f8ba08f 100644 --- a/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h +++ b/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h @@ -9,8 +9,8 @@ /// \file /// This file defines the SymbolGraphSerializer class. /// -/// Implement an APISerializer for the Symbol Graph format for ExtractAPI. -/// See https://github.com/apple/swift-docc-symbolkit. +/// Implement an APISetVisitor to serialize the APISet into the Symbol Graph +/// format for ExtractAPI. See https://github.com/apple/swift-docc-symbolkit. /// //===----------------------------------------------------------------------===// @@ -31,14 +31,18 @@ namespace extractapi { using namespace llvm::json; -/// The serializer that organizes API information in the Symbol Graph format. +/// Common options to customize the visitor output. +struct SymbolGraphSerializerOption { + /// Do not include unnecessary whitespaces to save space. 
+ bool Compact; +}; + +/// The visitor that organizes API information in the Symbol Graph format. /// /// The Symbol Graph format (https://github.com/apple/swift-docc-symbolkit) /// models an API set as a directed graph, where nodes are symbol declarations, /// and edges are relationships between the connected symbols. -class SymbolGraphSerializer : public APISerializer { - virtual void anchor(); - +class SymbolGraphSerializer : public APISetVisitor { /// A JSON array of formatted symbols in \c APISet. Array Symbols; @@ -48,7 +52,7 @@ class SymbolGraphSerializer : public APISerializer { /// The Symbol Graph format version used by this serializer. static const VersionTuple FormatVersion; - /// Indicates whether child symbols should be serialized. This is mainly + /// Indicates whether child symbols should be visited. This is mainly /// useful for \c serializeSingleSymbolSGF. bool ShouldRecurse; @@ -59,9 +63,8 @@ class SymbolGraphSerializer : public APISerializer { /// Symbol Graph. Object serialize(); - /// Implement the APISerializer::serialize interface. Wrap serialize(void) and - /// write out the serialized JSON object to \p os. - void serialize(raw_ostream &os) override; + /// Wrap serialize(void) and write out the serialized JSON object to \p os. + void serialize(raw_ostream &os); /// Serialize a single symbol SGF. This is primarily used for libclang. /// @@ -136,35 +139,44 @@ class SymbolGraphSerializer : public APISerializer { void serializeRelationship(RelationshipKind Kind, SymbolReference Source, SymbolReference Target); - /// Serialize a global function record. - void serializeGlobalFunctionRecord(const GlobalFunctionRecord &Record); +protected: + /// The list of symbols to ignore. + /// + /// Note: This should be consulted before emitting a symbol. + const APIIgnoresList &IgnoresList; + + SymbolGraphSerializerOption Options; + +public: + /// Visit a global function record. + void visitGlobalFunctionRecord(const GlobalFunctionRecord &Record); - /// Serialize a global variable record. - void serializeGlobalVariableRecord(const GlobalVariableRecord &Record); + /// Visit a global variable record. + void visitGlobalVariableRecord(const GlobalVariableRecord &Record); - /// Serialize an enum record. - void serializeEnumRecord(const EnumRecord &Record); + /// Visit an enum record. + void visitEnumRecord(const EnumRecord &Record); - /// Serialize a struct record. - void serializeStructRecord(const StructRecord &Record); + /// Visit a struct record. + void visitStructRecord(const StructRecord &Record); - /// Serialize an Objective-C container record. - void serializeObjCContainerRecord(const ObjCContainerRecord &Record); + /// Visit an Objective-C container record. + void visitObjCContainerRecord(const ObjCContainerRecord &Record); - /// Serialize a macro definition record. - void serializeMacroDefinitionRecord(const MacroDefinitionRecord &Record); + /// Visit a macro definition record. + void visitMacroDefinitionRecord(const MacroDefinitionRecord &Record); - /// Serialize a typedef record. - void serializeTypedefRecord(const TypedefRecord &Record); + /// Visit a typedef record. + void visitTypedefRecord(const TypedefRecord &Record); + /// Serialize a single record. 
void serializeSingleRecord(const APIRecord *Record); -public: SymbolGraphSerializer(const APISet &API, const APIIgnoresList &IgnoresList, - APISerializerOption Options = {}, + SymbolGraphSerializerOption Options = {}, bool ShouldRecurse = true) - : APISerializer(API, IgnoresList, Options), ShouldRecurse(ShouldRecurse) { - } + : APISetVisitor(API), ShouldRecurse(ShouldRecurse), + IgnoresList(IgnoresList), Options(Options) {} }; } // namespace extractapi diff --git a/clang/lib/ExtractAPI/CMakeLists.txt b/clang/lib/ExtractAPI/CMakeLists.txt index 153d4b992fda7..b43fe742478ce 100644 --- a/clang/lib/ExtractAPI/CMakeLists.txt +++ b/clang/lib/ExtractAPI/CMakeLists.txt @@ -9,7 +9,6 @@ add_clang_library(clangExtractAPI AvailabilityInfo.cpp ExtractAPIConsumer.cpp DeclarationFragments.cpp - Serialization/SerializerBase.cpp Serialization/SymbolGraphSerializer.cpp TypedefUnderlyingTypeResolver.cpp diff --git a/clang/lib/ExtractAPI/Serialization/SerializerBase.cpp b/clang/lib/ExtractAPI/Serialization/SerializerBase.cpp deleted file mode 100644 index 71fd25b2b2abb..0000000000000 --- a/clang/lib/ExtractAPI/Serialization/SerializerBase.cpp +++ /dev/null @@ -1,19 +0,0 @@ -//===- ExtractAPI/Serialization/SerializerBase.cpp --------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file implements the APISerializer interface. -/// -//===----------------------------------------------------------------------===// - -#include "clang/ExtractAPI/Serialization/SerializerBase.h" -#include "llvm/Support/raw_ostream.h" - -using namespace clang::extractapi; - -void APISerializer::serialize(llvm::raw_ostream &os) {} diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp index 7676c74af6869..534e9288cc713 100644 --- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp +++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp @@ -14,16 +14,11 @@ #include "clang/ExtractAPI/Serialization/SymbolGraphSerializer.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Version.h" -#include "clang/ExtractAPI/API.h" -#include "clang/ExtractAPI/APIIgnoresList.h" #include "clang/ExtractAPI/DeclarationFragments.h" -#include "clang/ExtractAPI/Serialization/SerializerBase.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLFunctionalExtras.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/JSON.h" #include "llvm/Support/Path.h" #include "llvm/Support/VersionTuple.h" #include @@ -541,19 +536,16 @@ template Array generateParentContexts(const RecordTy &Record, const APISet &API, Language Lang) { Array ParentContexts; - generatePathComponents(Record, API, - [Lang, &ParentContexts](const PathComponent &PC) { - ParentContexts.push_back( - serializeParentContext(PC, Lang)); - }); + generatePathComponents( + Record, API, [Lang, &ParentContexts](const PathComponent &PC) { + ParentContexts.push_back(serializeParentContext(PC, Lang)); + }); return ParentContexts; } } // namespace -void SymbolGraphSerializer::anchor() {} - /// Defines the format version emitted by SymbolGraphSerializer. 
const VersionTuple SymbolGraphSerializer::FormatVersion{0, 5, 3}; @@ -670,7 +662,7 @@ void SymbolGraphSerializer::serializeRelationship(RelationshipKind Kind, Relationships.emplace_back(std::move(Relationship)); } -void SymbolGraphSerializer::serializeGlobalFunctionRecord( +void SymbolGraphSerializer::visitGlobalFunctionRecord( const GlobalFunctionRecord &Record) { auto Obj = serializeAPIRecord(Record); if (!Obj) @@ -679,7 +671,7 @@ void SymbolGraphSerializer::serializeGlobalFunctionRecord( Symbols.emplace_back(std::move(*Obj)); } -void SymbolGraphSerializer::serializeGlobalVariableRecord( +void SymbolGraphSerializer::visitGlobalVariableRecord( const GlobalVariableRecord &Record) { auto Obj = serializeAPIRecord(Record); if (!Obj) @@ -688,7 +680,7 @@ void SymbolGraphSerializer::serializeGlobalVariableRecord( Symbols.emplace_back(std::move(*Obj)); } -void SymbolGraphSerializer::serializeEnumRecord(const EnumRecord &Record) { +void SymbolGraphSerializer::visitEnumRecord(const EnumRecord &Record) { auto Enum = serializeAPIRecord(Record); if (!Enum) return; @@ -697,7 +689,7 @@ void SymbolGraphSerializer::serializeEnumRecord(const EnumRecord &Record) { serializeMembers(Record, Record.Constants); } -void SymbolGraphSerializer::serializeStructRecord(const StructRecord &Record) { +void SymbolGraphSerializer::visitStructRecord(const StructRecord &Record) { auto Struct = serializeAPIRecord(Record); if (!Struct) return; @@ -706,7 +698,7 @@ void SymbolGraphSerializer::serializeStructRecord(const StructRecord &Record) { serializeMembers(Record, Record.Fields); } -void SymbolGraphSerializer::serializeObjCContainerRecord( +void SymbolGraphSerializer::visitObjCContainerRecord( const ObjCContainerRecord &Record) { auto ObjCContainer = serializeAPIRecord(Record); if (!ObjCContainer) @@ -743,7 +735,7 @@ void SymbolGraphSerializer::serializeObjCContainerRecord( } } -void SymbolGraphSerializer::serializeMacroDefinitionRecord( +void SymbolGraphSerializer::visitMacroDefinitionRecord( const MacroDefinitionRecord &Record) { auto Macro = serializeAPIRecord(Record); @@ -758,28 +750,28 @@ void SymbolGraphSerializer::serializeSingleRecord(const APIRecord *Record) { case APIRecord::RK_Unknown: llvm_unreachable("Records should have a known kind!"); case APIRecord::RK_GlobalFunction: - serializeGlobalFunctionRecord(*cast(Record)); + visitGlobalFunctionRecord(*cast(Record)); break; case APIRecord::RK_GlobalVariable: - serializeGlobalVariableRecord(*cast(Record)); + visitGlobalVariableRecord(*cast(Record)); break; case APIRecord::RK_Enum: - serializeEnumRecord(*cast(Record)); + visitEnumRecord(*cast(Record)); break; case APIRecord::RK_Struct: - serializeStructRecord(*cast(Record)); + visitStructRecord(*cast(Record)); break; case APIRecord::RK_ObjCInterface: - serializeObjCContainerRecord(*cast(Record)); + visitObjCContainerRecord(*cast(Record)); break; case APIRecord::RK_ObjCProtocol: - serializeObjCContainerRecord(*cast(Record)); + visitObjCContainerRecord(*cast(Record)); break; case APIRecord::RK_MacroDefinition: - serializeMacroDefinitionRecord(*cast(Record)); + visitMacroDefinitionRecord(*cast(Record)); break; case APIRecord::RK_Typedef: - serializeTypedefRecord(*cast(Record)); + visitTypedefRecord(*cast(Record)); break; default: if (auto Obj = serializeAPIRecord(*Record)) { @@ -793,8 +785,7 @@ void SymbolGraphSerializer::serializeSingleRecord(const APIRecord *Record) { } } -void SymbolGraphSerializer::serializeTypedefRecord( - const TypedefRecord &Record) { +void SymbolGraphSerializer::visitTypedefRecord(const 
TypedefRecord &Record) { // Typedefs of anonymous types have their entries unified with the underlying // type. bool ShouldDrop = Record.UnderlyingType.Name.empty(); @@ -814,35 +805,7 @@ void SymbolGraphSerializer::serializeTypedefRecord( } Object SymbolGraphSerializer::serialize() { - // Serialize global variables in the API set. - for (const auto &GlobalVar : API.getGlobalVariables()) - serializeGlobalVariableRecord(*GlobalVar.second); - - for (const auto &GlobalFunction : API.getGlobalFunctions()) - serializeGlobalFunctionRecord(*GlobalFunction.second); - - // Serialize enum records in the API set. - for (const auto &Enum : API.getEnums()) - serializeEnumRecord(*Enum.second); - - // Serialize struct records in the API set. - for (const auto &Struct : API.getStructs()) - serializeStructRecord(*Struct.second); - - // Serialize Objective-C interface records in the API set. - for (const auto &ObjCInterface : API.getObjCInterfaces()) - serializeObjCContainerRecord(*ObjCInterface.second); - - // Serialize Objective-C protocol records in the API set. - for (const auto &ObjCProtocol : API.getObjCProtocols()) - serializeObjCContainerRecord(*ObjCProtocol.second); - - for (const auto &Macro : API.getMacros()) - serializeMacroDefinitionRecord(*Macro.second); - - for (const auto &Typedef : API.getTypedefs()) - serializeTypedefRecord(*Typedef.second); - + traverseAPISet(); return serializeCurrentGraph(); } From 520362b28db51f6374f2f91e9d8e98bb555d847f Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 30 May 2023 18:02:40 +0000 Subject: [PATCH 123/704] [gn build] Port 06ff9770477d --- llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn index cd0b98129699c..94ab7206f3649 100644 --- a/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/ExtractAPI/BUILD.gn @@ -15,7 +15,6 @@ static_library("ExtractAPI") { "AvailabilityInfo.cpp", "DeclarationFragments.cpp", "ExtractAPIConsumer.cpp", - "Serialization/SerializerBase.cpp", "Serialization/SymbolGraphSerializer.cpp", "TypedefUnderlyingTypeResolver.cpp", ] From e557b8a14247ad58ec20ff82612de852155f77ad Mon Sep 17 00:00:00 2001 From: Tue Ly Date: Mon, 29 May 2023 23:08:20 -0400 Subject: [PATCH 124/704] [libc][RISCV] Add log, log2, log1p, log10 for RISC-V64 entrypoints. Add log, log2, log1p, log10 RISCV64 entrypoints. 
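For reference, the four new double-precision entrypoints follow the standard C99 semantics of the single-precision variants already listed. A minimal sketch using plain <cmath> (nothing LLVM-libc-specific is assumed here) of what these functions compute, including why log1p is worth a separate entrypoint:

```cpp
// Standard-library sketch of the semantics behind the new log, log2, log1p,
// and log10 entrypoints; this is illustrative and not LLVM libc code.
#include <cmath>
#include <cstdio>

int main() {
  double x = 0.001953125; // 2^-9, exactly representable in double
  std::printf("log(2)     = %.17g\n", std::log(2.0));
  std::printf("log2(8)    = %.17g\n", std::log2(8.0));   // exactly 3
  std::printf("log10(1e3) = %.17g\n", std::log10(1e3));  // exactly 3
  // log1p(x) computes log(1 + x) while avoiding the cancellation error that
  // forming 1.0 + x first would introduce for small x.
  std::printf("log1p(x)   = %.17g\n", std::log1p(x));
  std::printf("log(1+x)   = %.17g\n", std::log(1.0 + x));
  return 0;
}
```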
Reviewed By: michaelrj, sivachandra Differential Revision: https://reviews.llvm.org/D151674 --- libc/config/linux/riscv64/entrypoints.txt | 4 ++++ libc/docs/math/index.rst | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/libc/config/linux/riscv64/entrypoints.txt b/libc/config/linux/riscv64/entrypoints.txt index c36a623701e5c..85e1364066f08 100644 --- a/libc/config/linux/riscv64/entrypoints.txt +++ b/libc/config/linux/riscv64/entrypoints.txt @@ -286,9 +286,13 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.llround libc.src.math.llroundf libc.src.math.llroundl + libc.src.math.log10 libc.src.math.log10f + libc.src.math.log1p libc.src.math.log1pf + libc.src.math.log2 libc.src.math.log2f + libc.src.math.log libc.src.math.logf libc.src.math.logb libc.src.math.logbf diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 40f18862849bc..cd9909ceda660 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -394,25 +394,25 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | lgammal | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log | |check| | |check| | | | |check| | | | |check| | | | | | +| log | |check| | |check| | | |check| | |check| | | | |check| | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | logf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | logl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log10 | |check| | |check| | | | |check| | | | |check| | | | | | +| log10 | |check| | |check| | | |check| | |check| | | | |check| | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log10f | |check| | |check| | | |check| | |check| | | | |check| | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log10l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log1p | |check| | |check| | | | |check| | | | |check| | | | | | +| log1p | |check| | |check| | | |check| | |check| | | | |check| | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log1pf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log1pl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log2 | |check| | |check| | | | |check| | | | |check| | | | | | +| log2 | |check| | |check| | | |check| | |check| | | | |check| | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | 
log2f | |check| | |check| | | |check| | |check| | | | |check| | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ From a2a51448c1c34ee1cd1c3ec3689e7a354bb0df7c Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Tue, 30 May 2023 18:19:00 +0000 Subject: [PATCH 125/704] [compiler-rt][builtins] Fix libatomic standalone build Differential Revision: https://reviews.llvm.org/D151679 --- compiler-rt/lib/builtins/CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index c3e22a8f354fc..66d11938d38ac 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -861,19 +861,17 @@ option(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC if(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC) add_custom_target(builtins-standalone-atomic) - set(BUILTIN_DEPS "") set(BUILTIN_TYPE SHARED) if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") if(NOT COMPILER_RT_LIBATOMIC_LINK_FLAGS) get_aix_libatomic_default_link_flags(COMPILER_RT_LIBATOMIC_LINK_FLAGS "${CMAKE_CURRENT_SOURCE_DIR}/ppc/atomic.exp") endif() - # The compiler needs builtins to link any other binaries, so let - # clang_rt.atomic be built after builtins. - set(BUILTIN_DEPS builtins) # For different versions of cmake, SHARED behaves differently. For some # versions, we might need MODULE rather than SHARED. get_aix_libatomic_type(BUILTIN_TYPE) + else() + list(APPEND COMPILER_RT_LIBATOMIC_LINK_FLAGS -nodefaultlibs) endif() foreach (arch ${BUILTIN_SUPPORTED_ARCH}) if(CAN_TARGET_${arch}) @@ -882,7 +880,7 @@ if(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC) ARCHS ${arch} SOURCES atomic.c LINK_FLAGS ${COMPILER_RT_LIBATOMIC_LINK_FLAGS} - DEPS ${BUILTIN_DEPS} + DEPS builtins PARENT_TARGET builtins-standalone-atomic) endif() endforeach() From fbea5aada14315da14c2e296831b1cb1cc1ddd61 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 30 May 2023 11:21:17 -0700 Subject: [PATCH 126/704] [Driver] Add ClangFlags::TargetSpecific to simplify err_drv_unsupported_opt_for_target processing clang/lib/Driver/ToolChains/Clang.cpp has a lot of fragments like the following: ``` if (const Arg *A = Args.getLastArg(...)) { if (Triple is xxx) A->render(Args, CmdArgs); else D.Diag(diag::err_drv_unsupported_opt_for_target) << ...; } ``` The problem is more apparent with a recent surge of AIX-specific options. Introduce the TargetSpecific flag so that we can move the target-specific options to ToolChains/*.cpp and ToolChains/Arch/*.cpp and overload the warn_drv_unused_argument mechanism to give an err_drv_unsupported_opt_for_target error. Migrate -march=/-mcpu= and some AIX-specific options to use this simplified pattern.
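To make the new flow concrete, below is a small self-contained model of the claim-then-diagnose behaviour this patch introduces. It is illustrative only: the Option struct, claimSupportedOptions, and the diagnostic strings are simplifications for the sketch, not the actual clang driver API.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Toy stand-ins for the driver concepts named in the commit message.
struct Option {
  std::string Spelling;
  bool TargetSpecific = false; // models ClangFlags::TargetSpecific
  bool Claimed = false;        // set by a toolchain that consumed the option
};

// A toolchain that understands an option "claims" it, the way the AIX
// toolchain hook claims the AIX-only flags in this patch.
void claimSupportedOptions(std::vector<Option> &Opts, const std::string &Triple) {
  bool IsAIX = Triple.find("aix") != std::string::npos;
  for (Option &O : Opts)
    if (IsAIX && O.Spelling.rfind("-mxcoff", 0) == 0)
      O.Claimed = true;
}

// After the toolchains run, unclaimed options are diagnosed: ordinary options
// keep the old "unused argument" warning, while unclaimed TargetSpecific
// options are upgraded to an unsupported-for-target error.
void diagnoseUnclaimed(const std::vector<Option> &Opts, const std::string &Triple) {
  for (const Option &O : Opts) {
    if (O.Claimed)
      continue;
    if (O.TargetSpecific)
      std::cout << "error: unsupported option '" << O.Spelling
                << "' for target '" << Triple << "'\n";
    else
      std::cout << "warning: argument unused during compilation: '"
                << O.Spelling << "'\n";
  }
}

int main() {
  std::vector<Option> Opts = {{"-mxcoff-roptr", /*TargetSpecific=*/true},
                              {"-msomething-harmless", /*TargetSpecific=*/false}};
  // On a non-AIX triple the XCOFF option is never claimed, so it now produces
  // an error instead of a quiet "unused argument" warning.
  claimSupportedOptions(Opts, "x86_64-unknown-linux-gnu");
  diagnoseUnclaimed(Opts, "x86_64-unknown-linux-gnu");
}
```

The design point is that a target-specific option no longer needs a per-option triple check in Clang.cpp; whichever toolchain supports it claims it, and anything left unclaimed is promoted from a warning to an error.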
Reviewed By: jansvoboda11 Differential Revision: https://reviews.llvm.org/D151590 --- clang/include/clang/Driver/Options.h | 1 + clang/include/clang/Driver/Options.td | 18 ++++++---- clang/lib/Driver/Driver.cpp | 12 +++++-- clang/lib/Driver/ToolChains/AIX.cpp | 12 +++++++ clang/lib/Driver/ToolChains/AIX.h | 4 +++ clang/lib/Driver/ToolChains/Arch/PPC.cpp | 4 --- clang/lib/Driver/ToolChains/Arch/Sparc.cpp | 6 ---- clang/lib/Driver/ToolChains/Clang.cpp | 34 ------------------- clang/lib/Driver/ToolChains/CommonArgs.cpp | 6 ++-- .../mdefault-visibility-export-mapping.c | 2 +- 10 files changed, 42 insertions(+), 57 deletions(-) diff --git a/clang/include/clang/Driver/Options.h b/clang/include/clang/Driver/Options.h index f7ee154b7a7ab..54c6f5faa37c2 100644 --- a/clang/include/clang/Driver/Options.h +++ b/clang/include/clang/Driver/Options.h @@ -38,6 +38,7 @@ enum ClangFlags { DXCOption = (1 << 17), CLDXCOption = (1 << 18), Ignored = (1 << 19), + TargetSpecific = (1 << 20), }; enum ID { diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 93732f2b0768a..f3bfc26f271cc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -75,6 +75,10 @@ def FlangOnlyOption : OptionFlag; // FC1Option - This option should be accepted by flang -fc1. def FC1Option : OptionFlag; +// This is a target-specific option for compilation. Using it on an unsupported +// target will lead to an err_drv_unsupported_opt_for_target error. +def TargetSpecific : OptionFlag; + // A short name to show in documentation. The name will be interpreted as rST. class DocName { string DocName = name; } @@ -89,6 +93,8 @@ class DocFlatten { bit DocFlatten = 1; } // GCC compatibility. class IgnoredGCCCompat : Flags<[HelpHidden]> {} +class TargetSpecific : Flags<[TargetSpecific]> {} + ///////// // Groups @@ -3123,7 +3129,7 @@ def mdefault_visibility_export_mapping_EQ : Joined<["-"], "mdefault-visibility-e NormalizedValuesScope<"LangOptions::DefaultVisiblityExportMapping">, NormalizedValues<["None", "Explicit", "All"]>, HelpText<"Mapping between default visibility and export">, - Group, Flags<[CC1Option]>, + Group, Flags<[CC1Option,TargetSpecific]>, MarshallingInfoEnum,"None">; defm new_infallible : BoolFOption<"new-infallible", LangOpts<"NewInfallible">, DefaultFalse, @@ -3507,7 +3513,7 @@ def mappletvsimulator_version_min_EQ : Joined<["-"], "mappletvsimulator-version- def mwatchos_version_min_EQ : Joined<["-"], "mwatchos-version-min=">, Group; def mwatchos_simulator_version_min_EQ : Joined<["-"], "mwatchos-simulator-version-min=">; def mwatchsimulator_version_min_EQ : Joined<["-"], "mwatchsimulator-version-min=">, Alias; -def march_EQ : Joined<["-"], "march=">, Group, Flags<[CoreOption]>, +def march_EQ : Joined<["-"], "march=">, Group, Flags<[CoreOption,TargetSpecific]>, HelpText<"For a list of available architectures for the target use '-mcpu=help'">; def masm_EQ : Joined<["-"], "masm=">, Group, Flags<[NoXarchOption]>; def inline_asm_EQ : Joined<["-"], "inline-asm=">, Group, Flags<[CC1Option]>, @@ -3532,7 +3538,7 @@ def mthreads : Joined<["-"], "mthreads">, Group, Flags<[NoXarchOption]> def mguard_EQ : Joined<["-"], "mguard=">, Group, Flags<[NoXarchOption]>, HelpText<"Enable or disable Control Flow Guard checks and guard tables emission">, Values<"none,cf,cf-nochecks">; -def mcpu_EQ : Joined<["-"], "mcpu=">, Group, +def mcpu_EQ : Joined<["-"], "mcpu=">, Group, TargetSpecific, HelpText<"For a list of available CPUs for the target use '-mcpu=help'">; def 
mmcu_EQ : Joined<["-"], "mmcu=">, Group; def msim : Flag<["-"], "msim">, Group; @@ -3925,9 +3931,9 @@ def maix_struct_return : Flag<["-"], "maix-struct-return">, def msvr4_struct_return : Flag<["-"], "msvr4-struct-return">, Group, Flags<[CC1Option]>, HelpText<"Return small structs in registers (PPC32 only)">; -def mxcoff_roptr : Flag<["-"], "mxcoff-roptr">, Group, Flags<[CC1Option]>, +def mxcoff_roptr : Flag<["-"], "mxcoff-roptr">, Group, Flags<[CC1Option,TargetSpecific]>, HelpText<"Place constant objects with relocatable address values in the RO data section and add -bforceimprw to the linker flags (AIX only)">; -def mno_xcoff_roptr : Flag<["-"], "mno-xcoff-roptr">, Group; +def mno_xcoff_roptr : Flag<["-"], "mno-xcoff-roptr">, Group, TargetSpecific; def mvx : Flag<["-"], "mvx">, Group; def mno_vx : Flag<["-"], "mno-vx">, Group; @@ -3943,7 +3949,7 @@ def mxcoff_build_id_EQ : Joined<["-"], "mxcoff-build-id=">, Group, M HelpText<"On AIX, request creation of a build-id string, \"0xHEXSTRING\", in the string table of the loader section inside the linked binary">; def mignore_xcoff_visibility : Flag<["-"], "mignore-xcoff-visibility">, Group, HelpText<"Not emit the visibility attribute for asm in AIX OS or give all symbols 'unspecified' visibility in XCOFF object file">, - Flags<[CC1Option]>; + Flags<[CC1Option,TargetSpecific]>; defm backchain : BoolOption<"m", "backchain", CodeGenOpts<"Backchain">, DefaultFalse, PosFlag, diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 9f26ed676224b..ade59f45384fd 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4885,9 +4885,15 @@ void Driver::BuildJobs(Compilation &C) const { // In clang-cl, don't mention unknown arguments here since they have // already been warned about. 
- if (!IsCLMode() || !A->getOption().matches(options::OPT_UNKNOWN)) - Diag(clang::diag::warn_drv_unused_argument) - << A->getAsString(C.getArgs()); + if (!IsCLMode() || !A->getOption().matches(options::OPT_UNKNOWN)) { + if (A->getOption().hasFlag(options::TargetSpecific)) { + Diag(diag::err_drv_unsupported_opt_for_target) + << A->getSpelling() << getTargetTriple(); + } else { + Diag(clang::diag::warn_drv_unused_argument) + << A->getAsString(C.getArgs()); + } + } } } } diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index 46ad8231764db..ad7f3edeb9384 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -409,6 +409,18 @@ void AIX::AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args, llvm_unreachable("Unexpected C++ library type; only libc++ is supported."); } +void AIX::addClangTargetOptions( + const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadingKind) const { + Args.AddLastArg(CC1Args, options::OPT_mignore_xcoff_visibility); + Args.AddLastArg(CC1Args, options::OPT_mdefault_visibility_export_mapping_EQ); + Args.addOptInFlag(CC1Args, options::OPT_mxcoff_roptr, options::OPT_mno_xcoff_roptr); + + if (Args.hasFlag(options::OPT_fxl_pragma_pack, + options::OPT_fno_xl_pragma_pack, true)) + CC1Args.push_back("-fxl-pragma-pack"); +} + void AIX::addProfileRTLibs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const { // Add linker option -u__llvm_profile_runtime to cause runtime diff --git a/clang/lib/Driver/ToolChains/AIX.h b/clang/lib/Driver/ToolChains/AIX.h index e03aebcc3e7f0..cc74e5ea85efc 100644 --- a/clang/lib/Driver/ToolChains/AIX.h +++ b/clang/lib/Driver/ToolChains/AIX.h @@ -80,6 +80,10 @@ class LLVM_LIBRARY_VISIBILITY AIX : public ToolChain { void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const override; + void addClangTargetOptions( + const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadingKind) const override; + void addProfileRTLibs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const override; diff --git a/clang/lib/Driver/ToolChains/Arch/PPC.cpp b/clang/lib/Driver/ToolChains/Arch/PPC.cpp index befbd365fd03f..ab24d14992cd7 100644 --- a/clang/lib/Driver/ToolChains/Arch/PPC.cpp +++ b/clang/lib/Driver/ToolChains/Arch/PPC.cpp @@ -87,10 +87,6 @@ std::string ppc::getPPCTuneCPU(const ArgList &Args, const llvm::Triple &T) { /// Get the (LLVM) name of the PowerPC cpu we are targeting. 
std::string ppc::getPPCTargetCPU(const Driver &D, const ArgList &Args, const llvm::Triple &T) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { - D.Diag(diag::err_drv_unsupported_opt_for_target) - << A->getSpelling() << T.getTriple(); - } if (Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) return normalizeCPUName(A->getValue(), T); return getPPCGenericTargetCPU(T); diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp index e775599e8f5f7..11c9444fde2b1 100644 --- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp @@ -118,12 +118,6 @@ sparc::FloatABI sparc::getSparcFloatABI(const Driver &D, std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args, const llvm::Triple &Triple) { - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ)) { - D.Diag(diag::err_drv_unsupported_opt_for_target) - << A->getSpelling() << Triple.getTriple(); - return ""; - } - if (const Arg *A = Args.getLastArg(clang::driver::options::OPT_mcpu_EQ)) { StringRef CPUName = A->getValue(); if (CPUName == "native") { diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index e9d49fb556416..d5e8718641754 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5275,19 +5275,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, << A->getSpelling() << RawTriple.str(); } - if (Args.hasArg(options::OPT_mxcoff_roptr) || - Args.hasArg(options::OPT_mno_xcoff_roptr)) { - bool HasRoptr = Args.hasFlag(options::OPT_mxcoff_roptr, - options::OPT_mno_xcoff_roptr, false); - StringRef OptStr = HasRoptr ? "-mxcoff-roptr" : "-mno-xcoff-roptr"; - if (!Triple.isOSAIX()) - D.Diag(diag::err_drv_unsupported_opt_for_target) - << OptStr << RawTriple.str(); - - if (HasRoptr) - CmdArgs.push_back("-mxcoff-roptr"); - } - if (Arg *A = Args.getLastArg(options::OPT_Wframe_larger_than_EQ)) { StringRef V = A->getValue(), V1 = V; unsigned Size; @@ -6147,23 +6134,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, } } - if (const Arg *A = Args.getLastArg(options::OPT_mignore_xcoff_visibility)) { - if (Triple.isOSAIX()) - CmdArgs.push_back("-mignore-xcoff-visibility"); - else - D.Diag(diag::err_drv_unsupported_opt_for_target) - << A->getAsString(Args) << TripleStr; - } - - if (const Arg *A = - Args.getLastArg(options::OPT_mdefault_visibility_export_mapping_EQ)) { - if (Triple.isOSAIX()) - A->render(Args, CmdArgs); - else - D.Diag(diag::err_drv_unsupported_opt_for_target) - << A->getAsString(Args) << TripleStr; - } - if (Args.hasFlag(options::OPT_fvisibility_inlines_hidden, options::OPT_fno_visibility_inlines_hidden, false)) CmdArgs.push_back("-fvisibility-inlines-hidden"); @@ -6976,10 +6946,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.addOptInFlag(CmdArgs, options::OPT_fapple_pragma_pack, options::OPT_fno_apple_pragma_pack); - if (Args.hasFlag(options::OPT_fxl_pragma_pack, - options::OPT_fno_xl_pragma_pack, RawTriple.isOSAIX())) - CmdArgs.push_back("-fxl-pragma-pack"); - // Remarks can be enabled with any of the `-f.*optimization-record.*` flags. 
if (willEmitRemarks(Args) && checkRemarksOptions(D, Args, Triple)) renderRemarksOptions(Args, CmdArgs, Triple, Input, Output, JA); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 1ed93ba8b61b8..57bf345f1708e 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -337,6 +337,7 @@ void tools::AddTargetFeature(const ArgList &Args, /// Get the (LLVM) name of the AMDGPU gpu we are targeting. static std::string getAMDGPUTargetGPU(const llvm::Triple &T, const ArgList &Args) { + Arg *MArch = Args.getLastArg(options::OPT_march_EQ); if (Arg *A = Args.getLastArg(options::OPT_mcpu_EQ)) { auto GPUName = getProcessorFromTargetID(T, A->getValue()); return llvm::StringSwitch(GPUName) @@ -349,9 +350,8 @@ static std::string getAMDGPUTargetGPU(const llvm::Triple &T, .Case("aruba", "cayman") .Default(GPUName.str()); } - if (Arg *A = Args.getLastArg(options::OPT_march_EQ)) { - return getProcessorFromTargetID(T, A->getValue()).str(); - } + if (MArch) + return getProcessorFromTargetID(T, MArch->getValue()).str(); return ""; } diff --git a/clang/test/Driver/mdefault-visibility-export-mapping.c b/clang/test/Driver/mdefault-visibility-export-mapping.c index 506149b897e04..2f8f246373d57 100644 --- a/clang/test/Driver/mdefault-visibility-export-mapping.c +++ b/clang/test/Driver/mdefault-visibility-export-mapping.c @@ -4,4 +4,4 @@ // CHECK: "-mdefault-visibility-export-mapping=explicit" -// ERROR: unsupported option '-mdefault-visibility-export-mapping=explicit' for target 'powerpc-unknown-linux' +// ERROR: error: unsupported option '-mdefault-visibility-export-mapping=' for target 'powerpc-unknown-linux' From 54d45ddc89f1e2d3250b4d5093bea28e6e475cb7 Mon Sep 17 00:00:00 2001 From: Alex Brachet Date: Tue, 30 May 2023 18:21:36 +0000 Subject: [PATCH 127/704] [clang-tidy][docs] Fix link to libc style guide Differential Revision: https://reviews.llvm.org/D151502 --- clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h | 2 +- .../docs/clang-tidy/checks/llvmlibc/inline-function-decl.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h index 85d87a59e3733..662a592abd9be 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::llvm_libc { /// are tagged with the LIBC_INLINE macro. /// /// For more information about the LIBC_INLINE macro, see -/// https://libc.llvm.org/code_style.html. +/// https://libc.llvm.org/dev/code_style.html. /// /// For the user-facing documentation see: /// http://clang.llvm.org/extra/clang-tidy/checks/llvmlibc/inline-function-decl-check.html diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvmlibc/inline-function-decl.rst b/clang-tools-extra/docs/clang-tidy/checks/llvmlibc/inline-function-decl.rst index da60a1fcdb112..101217b64c828 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvmlibc/inline-function-decl.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvmlibc/inline-function-decl.rst @@ -5,4 +5,4 @@ llvmlibc-inline-function-decl Checks that all implicit and explicit inline functions in header files are tagged with the ``LIBC_INLINE`` macro. See the `libc style guide -`_ for more information about this macro. +`_ for more information about this macro. 
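As a concrete illustration of what the linked style guide and the llvmlibc-inline-function-decl check ask for, here is a hypothetical header. LIBC_INLINE is given a stand-in definition only so the snippet is self-contained; in LLVM libc the real macro comes from the support headers.

```cpp
// example_header.h (hypothetical), showing the pattern the check enforces.
#ifndef EXAMPLE_HEADER_H
#define EXAMPLE_HEADER_H

#ifndef LIBC_INLINE
#define LIBC_INLINE inline // stand-in definition, not the real macro
#endif

namespace example {

// OK: an explicitly inline function defined in a header, tagged as required.
LIBC_INLINE int add_one(int x) { return x + 1; }

struct Counter {
  // OK: tagged in-class definition.
  LIBC_INLINE int get() const { return Value; }

  // Would be flagged: defined in the class body, so it is implicitly inline,
  // but it is missing the LIBC_INLINE tag.
  int bump() { return ++Value; }

  int Value = 0;
};

} // namespace example

#endif // EXAMPLE_HEADER_H
```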
From 627d5e16127bd8034b893e66ab0c86eacf2d939a Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 26 May 2023 22:11:24 +0000 Subject: [PATCH 128/704] [BOLT][CMake] Use LLVM macros for install targets The existing BOLT install targets are broken on Windows because they don't properly handle output extension. Rather than reimplementing this logic in BOLT, reuse the existing LLVM macros which already handle this aspect correctly. Differential Revision: https://reviews.llvm.org/D151595 --- bolt/tools/CMakeLists.txt | 10 ++++++++-- bolt/tools/driver/CMakeLists.txt | 7 ------- bolt/tools/merge-fdata/CMakeLists.txt | 5 ----- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/bolt/tools/CMakeLists.txt b/bolt/tools/CMakeLists.txt index 52050fb8b0056..e65a2763b8d0c 100644 --- a/bolt/tools/CMakeLists.txt +++ b/bolt/tools/CMakeLists.txt @@ -6,10 +6,16 @@ mark_as_advanced(BOLT_TOOLS_INSTALL_DIR) macro(add_bolt_tool name) llvm_add_tool(BOLT ${ARGV}) + install(TARGETS ${name} + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT ${name} + ) + add_llvm_install_targets(install-${name} DEPENDS ${name} COMPONENT ${name}) endmacro() -macro(add_bolt_tool_symlink name) - llvm_add_tool_symlink(BOLT ${ARGV}) +macro(add_bolt_tool_symlink name dest) + llvm_add_tool_symlink(BOLT ${name} ${dest}) + llvm_install_symlink(BOLT ${name} ${dest} ALWAYS_GENERATE) endmacro() add_subdirectory(driver) diff --git a/bolt/tools/driver/CMakeLists.txt b/bolt/tools/driver/CMakeLists.txt index 1c596194ffe80..29eb53064ee26 100644 --- a/bolt/tools/driver/CMakeLists.txt +++ b/bolt/tools/driver/CMakeLists.txt @@ -37,13 +37,6 @@ set(BOLT_DEPENDS ) add_custom_target(bolt DEPENDS ${BOLT_DEPENDS}) -install(PROGRAMS - ${CMAKE_BINARY_DIR}/bin/llvm-bolt - ${CMAKE_BINARY_DIR}/bin/perf2bolt - ${CMAKE_BINARY_DIR}/bin/llvm-boltdiff - DESTINATION ${CMAKE_INSTALL_BINDIR} - COMPONENT bolt - ) add_llvm_install_targets(install-bolt DEPENDS bolt COMPONENT bolt) set_target_properties(bolt PROPERTIES FOLDER "BOLT") set_target_properties(install-bolt PROPERTIES FOLDER "BOLT") diff --git a/bolt/tools/merge-fdata/CMakeLists.txt b/bolt/tools/merge-fdata/CMakeLists.txt index 9405acb6ab68d..ec843530f1ec9 100644 --- a/bolt/tools/merge-fdata/CMakeLists.txt +++ b/bolt/tools/merge-fdata/CMakeLists.txt @@ -11,11 +11,6 @@ add_bolt_tool(merge-fdata set_target_properties(merge-fdata PROPERTIES FOLDER "BOLT") add_dependencies(bolt merge-fdata) -install(PROGRAMS - ${CMAKE_BINARY_DIR}/bin/merge-fdata - DESTINATION ${CMAKE_INSTALL_BINDIR} - COMPONENT bolt - ) # Emit relocations for BOLT meta test (bolt/test/runtime/meta-merge-fdata.test) if (BOLT_INCLUDE_TESTS AND UNIX AND NOT APPLE) From 99a1aeefb3d6be2018b591ed8c184c6f75fac386 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Tue, 30 May 2023 19:28:14 +0000 Subject: [PATCH 129/704] Revert "[BOLT][CMake] Use LLVM macros for install targets" This reverts commit 627d5e16127bd8034b893e66ab0c86eacf2d939a.
--- bolt/tools/CMakeLists.txt | 10 ++-------- bolt/tools/driver/CMakeLists.txt | 7 +++++++ bolt/tools/merge-fdata/CMakeLists.txt | 5 +++++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/bolt/tools/CMakeLists.txt b/bolt/tools/CMakeLists.txt index e65a2763b8d0c..52050fb8b0056 100644 --- a/bolt/tools/CMakeLists.txt +++ b/bolt/tools/CMakeLists.txt @@ -6,16 +6,10 @@ mark_as_advanced(BOLT_TOOLS_INSTALL_DIR) macro(add_bolt_tool name) llvm_add_tool(BOLT ${ARGV}) - install(TARGETS ${name} - DESTINATION ${CMAKE_INSTALL_BINDIR} - COMPONENT ${name} - ) - add_llvm_install_targets(install-${name} DEPENDS ${name} COMPONENT ${name}) endmacro() -macro(add_bolt_tool_symlink name dest) - llvm_add_tool_symlink(BOLT ${name} ${dest}) - llvm_install_symlink(BOLT ${name} ${dest} ALWAYS_GENERATE) +macro(add_bolt_tool_symlink name) + llvm_add_tool_symlink(BOLT ${ARGV}) endmacro() add_subdirectory(driver) diff --git a/bolt/tools/driver/CMakeLists.txt b/bolt/tools/driver/CMakeLists.txt index 29eb53064ee26..1c596194ffe80 100644 --- a/bolt/tools/driver/CMakeLists.txt +++ b/bolt/tools/driver/CMakeLists.txt @@ -37,6 +37,13 @@ set(BOLT_DEPENDS ) add_custom_target(bolt DEPENDS ${BOLT_DEPENDS}) +install(PROGRAMS + ${CMAKE_BINARY_DIR}/bin/llvm-bolt + ${CMAKE_BINARY_DIR}/bin/perf2bolt + ${CMAKE_BINARY_DIR}/bin/llvm-boltdiff + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT bolt + ) add_llvm_install_targets(install-bolt DEPENDS bolt COMPONENT bolt) set_target_properties(bolt PROPERTIES FOLDER "BOLT") set_target_properties(install-bolt PROPERTIES FOLDER "BOLT") diff --git a/bolt/tools/merge-fdata/CMakeLists.txt b/bolt/tools/merge-fdata/CMakeLists.txt index ec843530f1ec9..9405acb6ab68d 100644 --- a/bolt/tools/merge-fdata/CMakeLists.txt +++ b/bolt/tools/merge-fdata/CMakeLists.txt @@ -11,6 +11,11 @@ add_bolt_tool(merge-fdata set_target_properties(merge-fdata PROPERTIES FOLDER "BOLT") add_dependencies(bolt merge-fdata) +install(PROGRAMS + ${CMAKE_BINARY_DIR}/bin/merge-fdata + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT bolt + ) # Emit relocations for BOLT meta test (bolt/test/runtime/meta-merge-fdata.test) if (BOLT_INCLUDE_TESTS AND UNIX AND NOT APPLE) From d81ce04587c006b6731198956c522c93d0df1050 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 30 May 2023 12:45:21 -0700 Subject: [PATCH 130/704] [Driver] Report error for unsupported -mlarge-endian/-mlittle-endian --- clang/include/clang/Driver/Options.td | 4 ++-- clang/lib/Driver/Driver.cpp | 18 ++++++++---------- clang/test/Driver/endian.c | 5 +++++ clang/test/Driver/ppc-endian.c | 2 +- llvm/include/llvm/Option/ArgList.h | 6 ++++++ 5 files changed, 22 insertions(+), 13 deletions(-) create mode 100644 clang/test/Driver/endian.c diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f3bfc26f271cc..f6240f86447cb 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3452,9 +3452,9 @@ def l : JoinedOrSeparate<["-"], "l">, Flags<[LinkerInput, RenderJoined]>, Group; def lazy__framework : Separate<["-"], "lazy_framework">, Flags<[LinkerInput]>; def lazy__library : Separate<["-"], "lazy_library">, Flags<[LinkerInput]>; -def mlittle_endian : Flag<["-"], "mlittle-endian">, Flags<[NoXarchOption]>; +def mlittle_endian : Flag<["-"], "mlittle-endian">, Flags<[NoXarchOption,TargetSpecific]>; def EL : Flag<["-"], "EL">, Alias; -def mbig_endian : Flag<["-"], "mbig-endian">, Flags<[NoXarchOption]>; +def mbig_endian : Flag<["-"], "mbig-endian">, Flags<[NoXarchOption,TargetSpecific]>; def 
EB : Flag<["-"], "EB">, Alias; def m16 : Flag<["-"], "m16">, Group, Flags<[NoXarchOption, CoreOption]>; def m32 : Flag<["-"], "m32">, Group, Flags<[NoXarchOption, CoreOption]>; diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index ade59f45384fd..fee14ba6faabe 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -559,16 +559,14 @@ static llvm::Triple computeTargetTriple(const Driver &D, // Handle pseudo-target flags '-mlittle-endian'/'-EL' and // '-mbig-endian'/'-EB'. - if (Arg *A = Args.getLastArg(options::OPT_mlittle_endian, - options::OPT_mbig_endian)) { - if (A->getOption().matches(options::OPT_mlittle_endian)) { - llvm::Triple LE = Target.getLittleEndianArchVariant(); - if (LE.getArch() != llvm::Triple::UnknownArch) - Target = std::move(LE); - } else { - llvm::Triple BE = Target.getBigEndianArchVariant(); - if (BE.getArch() != llvm::Triple::UnknownArch) - Target = std::move(BE); + if (Arg *A = Args.getLastArgNoClaim(options::OPT_mlittle_endian, + options::OPT_mbig_endian)) { + llvm::Triple T = A->getOption().matches(options::OPT_mlittle_endian) + ? Target.getLittleEndianArchVariant() + : Target.getBigEndianArchVariant(); + if (T.getArch() != llvm::Triple::UnknownArch) { + Target = std::move(T); + Args.claimAllArgs(options::OPT_mlittle_endian, options::OPT_mbig_endian); } } diff --git a/clang/test/Driver/endian.c b/clang/test/Driver/endian.c new file mode 100644 index 0000000000000..7fddbc93e9cd6 --- /dev/null +++ b/clang/test/Driver/endian.c @@ -0,0 +1,5 @@ +// RUN: %clang -### -c --target=x86_64 -mbig-endian -mlittle-endian %s 2>&1 | FileCheck /dev/null --implicit-check-not=error: +// RUN: %clang -### -c --target=x86_64 -mlittle-endian -mbig-endian %s 2>&1 | FileCheck %s --implicit-check-not=error: + +// CHECK: error: unsupported option '-mlittle-endian' for target 'x86_64' +// CHECK: error: unsupported option '-mbig-endian' for target 'x86_64' diff --git a/clang/test/Driver/ppc-endian.c b/clang/test/Driver/ppc-endian.c index 00c1c25bb358f..5854fa63fdc83 100644 --- a/clang/test/Driver/ppc-endian.c +++ b/clang/test/Driver/ppc-endian.c @@ -1,5 +1,5 @@ // RUN: %clang -target powerpc-unknown -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE32 %s -// RUN: %clang -target powerpc-unknown -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE32 %s +// RUN: %clang --target=powerpc-unknown -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE32 %s --implicit-check-not=error: // RUN: %clang -target powerpcle-unknown -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE32 %s // CHECK-BE32: "-cc1"{{.*}} "-triple" "powerpc-{{.*}}" diff --git a/llvm/include/llvm/Option/ArgList.h b/llvm/include/llvm/Option/ArgList.h index 6a07e1c657dca..310c8900af9ef 100644 --- a/llvm/include/llvm/Option/ArgList.h +++ b/llvm/include/llvm/Option/ArgList.h @@ -354,6 +354,12 @@ class ArgList { /// option id. void ClaimAllArgs(OptSpecifier Id0) const; + template + void claimAllArgs(OptSpecifiers... Ids) const { + for (Arg *A : filtered(Ids...)) + A->claim(); + } + /// ClaimAllArgs - Claim all arguments. 
/// void ClaimAllArgs() const; From 28b26b161c2f5f8aecf8fffa7220cacc990ba51c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Mon, 29 May 2023 20:08:56 +0000 Subject: [PATCH 131/704] [clang] [test] Narrow down an MSVC specific behaviour to only not covever MinGW This uses the same logic as in c2b256a990590dc8b69930259650cfeb085add03; we can't check defined(_MSC_VER) invoked as %clang_cc1, therefore check for !defined(__MINGW32__) instead. This fixes the same issue in a new testcase that was added after this issue was fixed last time in c2b256a990590dc8b69930259650cfeb085add03. Differential Revision: https://reviews.llvm.org/D151661 --- clang/test/CXX/drs/dr9xx.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/CXX/drs/dr9xx.cpp b/clang/test/CXX/drs/dr9xx.cpp index 4dfb98e5ec11e..fb13ef2967f88 100644 --- a/clang/test/CXX/drs/dr9xx.cpp +++ b/clang/test/CXX/drs/dr9xx.cpp @@ -92,7 +92,7 @@ namespace dr974 { // dr974: yes namespace dr977 { // dr977: yes enum E { e = E() }; -#ifndef _WIN32 +#if !defined(_WIN32) || defined(__MINGW32__) // expected-error@-2 {{invalid use of incomplete type 'E'}} // expected-note@-3 {{definition of 'dr977::E' is not complete until the closing '}'}} #endif From 02bf5e36f5bdd4e19b148f17bdb23465a5e070cc Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 30 May 2023 21:08:37 +0100 Subject: [PATCH 132/704] [ConstraintElim] Add additional tests for and implication handling. --- .../and-implied-by-operands.ll | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll index 2f684c555e69f..3b7674df05fd3 100644 --- a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll +++ b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll @@ -192,3 +192,74 @@ entry: %and = and i1 %c.1, %t.1 ret i1 %and } + +define i1 @and_select_not_used_for_branch(i32 %x, i32 %y,i32 %z) { +; CHECK-LABEL: @and_select_not_used_for_branch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ne i32 [[X:%.*]], 0 +; CHECK-NEXT: [[C_2:%.*]] = icmp ne i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[C_3:%.*]] = icmp eq i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_2]], [[C_3]] +; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: [[C_4:%.*]] = icmp eq i32 [[Z:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C_4]], i1 [[C_1]], i1 false +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ [[C_1]], [[ENTRY:%.*]] ], [ [[SEL]], [[THEN]] ] +; CHECK-NEXT: ret i1 [[RES]] +; +entry: + %c.1 = icmp ne i32 %x, 0 + %c.2 = icmp ne i32 %y, 0 + %c.3 = icmp eq i32 %x, 16 + %and = and i1 %c.2, %c.3 + br i1 %and, label %then, label %exit + +then: + %c.4 = icmp eq i32 %z, 0 + %sel = select i1 %c.4, i1 %c.1, i1 false + br label %exit + +exit: + %res = phi i1 [ %c.1, %entry ], [ %sel, %then ] + ret i1 %res +} + +define i1 @and_select_scope_limited(i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @and_select_scope_limited( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ne i32 [[X:%.*]], 0 +; CHECK-NEXT: [[C_2:%.*]] = icmp ne i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[C_3:%.*]] = icmp eq i32 [[X]], 16 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_2]], [[C_3]] +; CHECK-NEXT: br i1 [[AND]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: [[C_4:%.*]] = icmp eq i32 [[Z:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select 
i1 [[C_4]], i1 [[C_1]], i1 false +; CHECK-NEXT: br i1 [[SEL]], label [[T_1:%.*]], label [[EXIT]] +; CHECK: t.1: +; CHECK-NEXT: ret i1 [[C_1]] +; CHECK: exit: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ [[C_1]], [[ENTRY:%.*]] ], [ [[SEL]], [[THEN]] ] +; CHECK-NEXT: ret i1 [[RES]] +; +entry: + %c.1 = icmp ne i32 %x, 0 + %c.2 = icmp ne i32 %y, 0 + %c.3 = icmp eq i32 %x, 16 + %and = and i1 %c.2, %c.3 + br i1 %and, label %then, label %exit + +then: + %c.4 = icmp eq i32 %z, 0 + %sel = select i1 %c.4, i1 %c.1, i1 false + br i1 %sel, label %t.1, label %exit + +t.1: + ret i1 %c.1 + +exit: + %res = phi i1 [ %c.1, %entry ], [ %sel, %then ] + ret i1 %res +} From 19ef02e3f4f82a439a94479589a9f1244d0b1b06 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 18 May 2023 14:16:24 -0700 Subject: [PATCH 133/704] [libc++][ci] Use ccache in the jobs that build Clang This is an attempt to reduce the time taken by the Bootstrapping build job and the Clang CI job that builds the compiler from scratch. Differential Revision: https://reviews.llvm.org/D150908 --- libcxx/utils/ci/buildkite-pipeline-clang.yml | 3 ++- libcxx/utils/ci/run-buildbot | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/libcxx/utils/ci/buildkite-pipeline-clang.yml b/libcxx/utils/ci/buildkite-pipeline-clang.yml index 40bb80aa8d30e..323f4cf80b890 100644 --- a/libcxx/utils/ci/buildkite-pipeline-clang.yml +++ b/libcxx/utils/ci/buildkite-pipeline-clang.yml @@ -23,8 +23,9 @@ steps: # We use Release here to avoid including debug information. Otherwise, the clang binary is very large, which # is problematic because we need to upload the artifacts for other jobs to use. This may seem like nothing, # but with the number of jobs we run daily, this can result in thousands of GB of network I/O. - - "cmake -S llvm -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DLLVM_ENABLE_PROJECTS=\"clang;compiler-rt\"" + - "cmake -S llvm -B build -G Ninja -DCMAKE_CXX_COMPILER_LAUNCHER="ccache" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DLLVM_ENABLE_PROJECTS=\"clang;compiler-rt\"" - "ninja -C build install-clang install-clang-resource-headers" + - "ccache -s" - "tar -cJvf install.tar.xz install/" - "buildkite-agent artifact upload --debug install.tar.xz" env: diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index 420dfde5645a6..650201251f8d1 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -340,6 +340,7 @@ bootstrapping-build) -S "${MONOREPO_ROOT}/llvm" \ -B "${BUILD_DIR}" \ -GNinja -DCMAKE_MAKE_PROGRAM="${NINJA}" \ + -DCMAKE_CXX_COMPILER_LAUNCHER="ccache" \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ -DLLVM_ENABLE_PROJECTS="clang" \ From 8e0001eb95ce8654660510ddb06f5a8a3c5c6d68 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Fri, 26 May 2023 18:26:49 -0700 Subject: [PATCH 134/704] [lldb][NFCI] Refactor Language::GetFormatterPrefixSuffix - Remove unused parameter `valobj` (I checked downstream, not even swift is using it). - Return a std::pair insted of having 2 out parameter strings. - Remove the use of ConstStrings. This change was primarily mechanical except in `ObjCLanguage::GetFormatterPrefixSuffix`. To keep this fast, we construct an llvm::StringMap> so that we can look things up quickly. There is some amount of cost to setting up the map the first time it is called, but subsequent lookups should be as fast as a hash + string comparison (the cost of looking up something in an llvm::StringMap). 
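A minimal sketch of the lookup structure described above, assuming the map's value type pairs a prefix with a suffix. The keys mirror the type hints used by the formatters, while the mapped strings below are placeholders rather than the values the ObjC plugin actually returns, and getFormatterPrefixSuffix is a free-function stand-in for the member function being refactored.

```cpp
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include <utility>

using PrefixSuffix = std::pair<llvm::StringRef, llvm::StringRef>;

// Built once on first use; later calls only pay for the hash plus the string
// comparison that a StringMap lookup performs.
static const llvm::StringMap<PrefixSuffix> &getPrefixSuffixMap() {
  static const llvm::StringMap<PrefixSuffix> Map = [] {
    llvm::StringMap<PrefixSuffix> M;
    // Keys mirror the formatter type hints; the mapped strings are placeholders.
    M.try_emplace("NSString", PrefixSuffix{"@", ""});
    M.try_emplace("NSNumber:char", PrefixSuffix{"(char)", ""});
    return M;
  }();
  return Map;
}

// Mirrors the new interface shape: one StringRef hint in, a prefix/suffix pair
// out. An unknown hint yields a value-initialized pair (two empty StringRefs),
// which matches the "may be empty" contract of the refactored API.
PrefixSuffix getFormatterPrefixSuffix(llvm::StringRef TypeHint) {
  return getPrefixSuffixMap().lookup(TypeHint);
}
```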
Differential Revision: https://reviews.llvm.org/D151603 --- lldb/include/lldb/Target/Language.h | 23 ++- lldb/source/Plugins/Language/ObjC/CF.cpp | 36 ++-- lldb/source/Plugins/Language/ObjC/Cocoa.cpp | 157 ++++++++---------- lldb/source/Plugins/Language/ObjC/NSArray.cpp | 18 +- .../Plugins/Language/ObjC/NSDictionary.cpp | 19 +-- lldb/source/Plugins/Language/ObjC/NSSet.cpp | 18 +- .../source/Plugins/Language/ObjC/NSString.cpp | 40 ++--- .../Plugins/Language/ObjC/ObjCLanguage.cpp | 93 +++-------- .../Plugins/Language/ObjC/ObjCLanguage.h | 5 +- lldb/source/Target/Language.cpp | 8 +- 10 files changed, 157 insertions(+), 260 deletions(-) diff --git a/lldb/include/lldb/Target/Language.h b/lldb/include/lldb/Target/Language.h index bbb3e7c0cc8d1..a6b9ccaf31b3c 100644 --- a/lldb/include/lldb/Target/Language.h +++ b/lldb/include/lldb/Target/Language.h @@ -208,14 +208,21 @@ class Language : public PluginInterface { /// that the name actually belongs to this language. virtual bool SymbolNameFitsToLanguage(Mangled name) const { return false; } - // if an individual data formatter can apply to several types and cross a - // language boundary it makes sense for individual languages to want to - // customize the printing of values of that type by appending proper - // prefix/suffix information in language-specific ways - virtual bool GetFormatterPrefixSuffix(ValueObject &valobj, - ConstString type_hint, - std::string &prefix, - std::string &suffix); + /// An individual data formatter may apply to several types and cross language + /// boundaries. Each of those languages may want to customize the display of + /// values of said types by appending proper prefix/suffix information in + /// language-specific ways. This function returns that prefix and suffix. + /// + /// \param[in] type_hint + /// A StringRef used to determine what the prefix and suffix should be. It + /// is called a hint because some types may have multiple variants for which + /// the prefix and/or suffix may vary. + /// + /// \return + /// A std::pair, the first being the prefix and the + /// second being the suffix. They may be empty. + virtual std::pair + GetFormatterPrefixSuffix(llvm::StringRef type_hint); // When looking up functions, we take a user provided string which may be a // partial match to the full demangled name and compare it to the actual diff --git a/lldb/source/Plugins/Language/ObjC/CF.cpp b/lldb/source/Plugins/Language/ObjC/CF.cpp index fa2130e4b01e3..0926192a4f384 100644 --- a/lldb/source/Plugins/Language/ObjC/CF.cpp +++ b/lldb/source/Plugins/Language/ObjC/CF.cpp @@ -44,7 +44,7 @@ bool lldb_private::formatters::CFAbsoluteTimeSummaryProvider( bool lldb_private::formatters::CFBagSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { - static ConstString g_TypeHint("CFBag"); + static constexpr llvm::StringLiteral g_TypeHint("CFBag"); ProcessSP process_sp = valobj.GetProcessSP(); if (!process_sp) @@ -92,17 +92,13 @@ bool lldb_private::formatters::CFBagSummaryProvider( } else return false; - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(options.GetLanguage())) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(options.GetLanguage())) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s\"%u value%s\"%s", prefix.c_str(), count, - (count == 1 ? 
"" : "s"), suffix.c_str()); + stream << prefix; + stream.Printf("\"%u value%s\"", count, (count == 1 ? "" : "s")); + stream << suffix; return true; } @@ -226,7 +222,7 @@ bool lldb_private::formatters::CFBitVectorSummaryProvider( bool lldb_private::formatters::CFBinaryHeapSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { - static ConstString g_TypeHint("CFBinaryHeap"); + static constexpr llvm::StringLiteral g_TypeHint("CFBinaryHeap"); ProcessSP process_sp = valobj.GetProcessSP(); if (!process_sp) @@ -279,16 +275,12 @@ bool lldb_private::formatters::CFBinaryHeapSummaryProvider( } else return false; - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(options.GetLanguage())) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(options.GetLanguage())) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s\"%u item%s\"%s", prefix.c_str(), count, - (count == 1 ? "" : "s"), suffix.c_str()); + stream << prefix; + stream.Printf("\"%u item%s\"", count, (count == 1 ? "" : "s")); + stream << suffix; return true; } diff --git a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp index 46f82daaff8d5..243bec90f70c8 100644 --- a/lldb/source/Plugins/Language/ObjC/Cocoa.cpp +++ b/lldb/source/Plugins/Language/ObjC/Cocoa.cpp @@ -305,120 +305,97 @@ bool lldb_private::formatters::NSIndexSetSummaryProvider( static void NSNumber_FormatChar(ValueObject &valobj, Stream &stream, char value, lldb::LanguageType lang) { - static ConstString g_TypeHint("NSNumber:char"); - - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(lang)) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + static constexpr llvm::StringLiteral g_TypeHint("NSNumber:char"); + + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(lang)) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s%hhd%s", prefix.c_str(), value, suffix.c_str()); + stream << prefix; + stream.Printf("%hhd", value); + stream << suffix; } static void NSNumber_FormatShort(ValueObject &valobj, Stream &stream, short value, lldb::LanguageType lang) { - static ConstString g_TypeHint("NSNumber:short"); - - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(lang)) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + static constexpr llvm::StringLiteral g_TypeHint("NSNumber:short"); + + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(lang)) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s%hd%s", prefix.c_str(), value, suffix.c_str()); + stream << prefix; + stream.Printf("%hd", value); + stream << suffix; } static void NSNumber_FormatInt(ValueObject &valobj, Stream &stream, int value, lldb::LanguageType lang) { - static ConstString g_TypeHint("NSNumber:int"); - - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(lang)) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + static constexpr llvm::StringLiteral g_TypeHint("NSNumber:int"); 
+ + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(lang)) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s%d%s", prefix.c_str(), value, suffix.c_str()); + stream << prefix; + stream.Printf("%d", value); + stream << suffix; } static void NSNumber_FormatLong(ValueObject &valobj, Stream &stream, int64_t value, lldb::LanguageType lang) { - static ConstString g_TypeHint("NSNumber:long"); - - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(lang)) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + static constexpr llvm::StringLiteral g_TypeHint("NSNumber:long"); + + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(lang)) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s%" PRId64 "%s", prefix.c_str(), value, suffix.c_str()); + stream << prefix; + stream.Printf("%" PRId64 "", value); + stream << suffix; } static void NSNumber_FormatInt128(ValueObject &valobj, Stream &stream, const llvm::APInt &value, lldb::LanguageType lang) { - static ConstString g_TypeHint("NSNumber:int128_t"); - - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(lang)) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + static constexpr llvm::StringLiteral g_TypeHint("NSNumber:int128_t"); + + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(lang)) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.PutCString(prefix.c_str()); + stream << prefix; const int radix = 10; const bool isSigned = true; std::string str = llvm::toString(value, radix, isSigned); stream.PutCString(str.c_str()); - stream.PutCString(suffix.c_str()); + stream << suffix; } static void NSNumber_FormatFloat(ValueObject &valobj, Stream &stream, float value, lldb::LanguageType lang) { - static ConstString g_TypeHint("NSNumber:float"); - - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(lang)) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + static constexpr llvm::StringLiteral g_TypeHint("NSNumber:float"); + + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(lang)) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s%f%s", prefix.c_str(), value, suffix.c_str()); + stream << prefix; + stream.Printf("%f", value); + stream << suffix; } static void NSNumber_FormatDouble(ValueObject &valobj, Stream &stream, double value, lldb::LanguageType lang) { - static ConstString g_TypeHint("NSNumber:double"); - - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(lang)) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + static constexpr llvm::StringLiteral g_TypeHint("NSNumber:double"); + + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(lang)) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s%g%s", prefix.c_str(), value, suffix.c_str()); + stream << prefix; + stream.Printf("%g", value); + stream << suffix; } bool lldb_private::formatters::NSNumberSummaryProvider( @@ -813,29 +790,27 @@ 
bool lldb_private::formatters::NSURLSummaryProvider( if (!NSStringSummaryProvider(*text, summary, options) || summary.Empty()) return false; - const char quote_char = '"'; - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(options.GetLanguage())) { - if (!language->GetFormatterPrefixSuffix(*text, ConstString("NSString"), - prefix, suffix)) { - prefix.clear(); - suffix.clear(); - } - } + static constexpr llvm::StringLiteral quote_char("\""); + static constexpr llvm::StringLiteral g_TypeHint("NSString"); + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(options.GetLanguage())) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); + // @"A" -> @"A llvm::StringRef summary_str = summary.GetString(); - bool back_consumed = summary_str.consume_back(quote_char + suffix); + bool back_consumed = + summary_str.consume_back(suffix) && summary_str.consume_back(quote_char); assert(back_consumed); UNUSED_IF_ASSERT_DISABLED(back_consumed); // @"B" -> B" llvm::StringRef base_summary_str = base_summary.GetString(); - bool front_consumed = base_summary_str.consume_front(prefix + quote_char); + bool front_consumed = base_summary_str.consume_front(prefix) && + base_summary_str.consume_front(quote_char); assert(front_consumed); UNUSED_IF_ASSERT_DISABLED(front_consumed); // @"A -- B" if (!summary_str.empty() && !base_summary_str.empty()) { - stream.Printf("%s -- %s", summary_str.str().c_str(), - base_summary_str.str().c_str()); + stream << summary_str << " -- " << base_summary_str; return true; } diff --git a/lldb/source/Plugins/Language/ObjC/NSArray.cpp b/lldb/source/Plugins/Language/ObjC/NSArray.cpp index 18bb2b8c4fdcf..bd356a61161a5 100644 --- a/lldb/source/Plugins/Language/ObjC/NSArray.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSArray.cpp @@ -334,7 +334,7 @@ class NSArray1SyntheticFrontEnd : public SyntheticChildrenFrontEnd { bool lldb_private::formatters::NSArraySummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { - static ConstString g_TypeHint("NSArray"); + static constexpr llvm::StringLiteral g_TypeHint("NSArray"); ProcessSP process_sp = valobj.GetProcessSP(); if (!process_sp) @@ -445,17 +445,13 @@ bool lldb_private::formatters::NSArraySummaryProvider( return false; } - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(options.GetLanguage())) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(options.GetLanguage())) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s%" PRIu64 " %s%s%s", prefix.c_str(), value, "element", - value == 1 ? "" : "s", suffix.c_str()); + stream << prefix; + stream.Printf("%" PRIu64 " %s%s", value, "element", value == 1 ? 
"" : "s"); + stream << suffix; return true; } diff --git a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp index 4bab8924f3a5e..702d196a7dda9 100644 --- a/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSDictionary.cpp @@ -409,7 +409,7 @@ namespace Foundation1437 { template bool lldb_private::formatters::NSDictionarySummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { - static ConstString g_TypeHint("NSDictionary"); + static constexpr llvm::StringLiteral g_TypeHint("NSDictionary"); ProcessSP process_sp = valobj.GetProcessSP(); if (!process_sp) return false; @@ -501,17 +501,14 @@ bool lldb_private::formatters::NSDictionarySummaryProvider( return false; } - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(options.GetLanguage())) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(options.GetLanguage())) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s%" PRIu64 " %s%s%s", prefix.c_str(), value, "key/value pair", - value == 1 ? "" : "s", suffix.c_str()); + stream << prefix; + stream.Printf("%" PRIu64 " %s%s", value, "key/value pair", + value == 1 ? "" : "s"); + stream << suffix; return true; } diff --git a/lldb/source/Plugins/Language/ObjC/NSSet.cpp b/lldb/source/Plugins/Language/ObjC/NSSet.cpp index fac8594d0c7d9..44097ee0c42b8 100644 --- a/lldb/source/Plugins/Language/ObjC/NSSet.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSSet.cpp @@ -249,7 +249,7 @@ class NSSetCodeRunningSyntheticFrontEnd : public SyntheticChildrenFrontEnd { template bool lldb_private::formatters::NSSetSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { - static ConstString g_TypeHint("NSSet"); + static constexpr llvm::StringLiteral g_TypeHint("NSSet"); ProcessSP process_sp = valobj.GetProcessSP(); if (!process_sp) @@ -322,17 +322,13 @@ bool lldb_private::formatters::NSSetSummaryProvider( return false; } - std::string prefix, suffix; - if (Language *language = Language::FindPlugin(options.GetLanguage())) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(options.GetLanguage())) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); - stream.Printf("%s%" PRIu64 " %s%s%s", prefix.c_str(), value, "element", - value == 1 ? "" : "s", suffix.c_str()); + stream << prefix; + stream.Printf("%" PRIu64 " %s%s", value, "element", value == 1 ? 
"" : "s"); + stream << suffix; return true; } diff --git a/lldb/source/Plugins/Language/ObjC/NSString.cpp b/lldb/source/Plugins/Language/ObjC/NSString.cpp index 61705c866778c..0a30737d9723b 100644 --- a/lldb/source/Plugins/Language/ObjC/NSString.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSString.cpp @@ -33,7 +33,7 @@ NSString_Additionals::GetAdditionalSummaries() { bool lldb_private::formatters::NSStringSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &summary_options) { - static ConstString g_TypeHint("NSString"); + static constexpr llvm::StringLiteral g_TypeHint("NSString"); ProcessSP process_sp = valobj.GetProcessSP(); if (!process_sp) @@ -126,19 +126,13 @@ bool lldb_private::formatters::NSStringSummaryProvider( return true; } - std::string prefix, suffix; - if (Language *language = - Language::FindPlugin(summary_options.GetLanguage())) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(summary_options.GetLanguage())) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); StringPrinter::ReadStringAndDumpToStreamOptions options(valobj); - options.SetPrefixToken(prefix); - options.SetSuffixToken(suffix); + options.SetPrefixToken(prefix.str()); + options.SetSuffixToken(suffix.str()); if (is_mutable) { uint64_t location = 2 * ptr_size + valobj_addr; @@ -318,7 +312,7 @@ bool lldb_private::formatters::NSMutableAttributedStringSummaryProvider( bool lldb_private::formatters::NSTaggedString_SummaryProvider( ValueObject &valobj, ObjCLanguageRuntime::ClassDescriptorSP descriptor, Stream &stream, const TypeSummaryOptions &summary_options) { - static ConstString g_TypeHint("NSString"); + static constexpr llvm::StringLiteral g_TypeHint("NSString"); if (!descriptor) return false; @@ -336,23 +330,17 @@ bool lldb_private::formatters::NSTaggedString_SummaryProvider( if (len_bits > g_fiveBitMaxLen) return false; - std::string prefix, suffix; - if (Language *language = - Language::FindPlugin(summary_options.GetLanguage())) { - if (!language->GetFormatterPrefixSuffix(valobj, g_TypeHint, prefix, - suffix)) { - prefix.clear(); - suffix.clear(); - } - } + llvm::StringRef prefix, suffix; + if (Language *language = Language::FindPlugin(summary_options.GetLanguage())) + std::tie(prefix, suffix) = language->GetFormatterPrefixSuffix(g_TypeHint); // this is a fairly ugly trick - pretend that the numeric value is actually a // char* this works under a few assumptions: little endian architecture // sizeof(uint64_t) > g_MaxNonBitmaskedLen if (len_bits <= g_MaxNonBitmaskedLen) { - stream.Printf("%s", prefix.c_str()); + stream << prefix; stream.Printf("\"%s\"", (const char *)&data_bits); - stream.Printf("%s", suffix.c_str()); + stream << suffix; return true; } @@ -375,8 +363,8 @@ bool lldb_private::formatters::NSTaggedString_SummaryProvider( bytes.insert(bytes.begin(), sixBitToCharLookup[packed]); } - stream.Printf("%s", prefix.c_str()); + stream << prefix; stream.Printf("\"%s\"", &bytes[0]); - stream.Printf("%s", suffix.c_str()); + stream << suffix; return true; } diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp index 3a9e287158329..762f662d5773b 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.cpp @@ -998,78 +998,27 @@ std::unique_ptr ObjCLanguage::GetTypeScavenger() { 
ObjCDebugInfoScavenger>()); } -bool ObjCLanguage::GetFormatterPrefixSuffix(ValueObject &valobj, - ConstString type_hint, - std::string &prefix, - std::string &suffix) { - static ConstString g_CFBag("CFBag"); - static ConstString g_CFBinaryHeap("CFBinaryHeap"); - - static ConstString g_NSNumberChar("NSNumber:char"); - static ConstString g_NSNumberShort("NSNumber:short"); - static ConstString g_NSNumberInt("NSNumber:int"); - static ConstString g_NSNumberLong("NSNumber:long"); - static ConstString g_NSNumberInt128("NSNumber:int128_t"); - static ConstString g_NSNumberFloat("NSNumber:float"); - static ConstString g_NSNumberDouble("NSNumber:double"); - - static ConstString g_NSData("NSData"); - static ConstString g_NSArray("NSArray"); - static ConstString g_NSString("NSString"); - static ConstString g_NSStringStar("NSString*"); - - if (type_hint.IsEmpty()) - return false; - - prefix.clear(); - suffix.clear(); - - if (type_hint == g_CFBag || type_hint == g_CFBinaryHeap) { - prefix = "@"; - return true; - } - - if (type_hint == g_NSNumberChar) { - prefix = "(char)"; - return true; - } - if (type_hint == g_NSNumberShort) { - prefix = "(short)"; - return true; - } - if (type_hint == g_NSNumberInt) { - prefix = "(int)"; - return true; - } - if (type_hint == g_NSNumberLong) { - prefix = "(long)"; - return true; - } - if (type_hint == g_NSNumberInt128) { - prefix = "(int128_t)"; - return true; - } - if (type_hint == g_NSNumberFloat) { - prefix = "(float)"; - return true; - } - if (type_hint == g_NSNumberDouble) { - prefix = "(double)"; - return true; - } - - if (type_hint == g_NSData || type_hint == g_NSArray) { - prefix = "@\""; - suffix = "\""; - return true; - } - - if (type_hint == g_NSString || type_hint == g_NSStringStar) { - prefix = "@"; - return true; - } - - return false; +std::pair +ObjCLanguage::GetFormatterPrefixSuffix(llvm::StringRef type_hint) { + static constexpr llvm::StringRef empty; + static const llvm::StringMap< + std::pair> + g_affix_map = { + {"CFBag", {"@", empty}}, + {"CFBinaryHeap", {"@", empty}}, + {"NSString", {"@", empty}}, + {"NSString*", {"@", empty}}, + {"NSNumber:char", {"(char)", empty}}, + {"NSNumber:short", {"(short)", empty}}, + {"NSNumber:int", {"(int)", empty}}, + {"NSNumber:long", {"(long)", empty}}, + {"NSNumber:int128_t", {"(int128_t)", empty}}, + {"NSNumber:float", {"(float)", empty}}, + {"NSNumber:double", {"(double)", empty}}, + {"NSData", {"@\"", "\""}}, + {"NSArray", {"@\"", "\""}}, + }; + return g_affix_map.lookup(type_hint); } bool ObjCLanguage::IsNilReference(ValueObject &valobj) { diff --git a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h index 3b74e79254936..bb8057846bb7c 100644 --- a/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h +++ b/lldb/source/Plugins/Language/ObjC/ObjCLanguage.h @@ -150,9 +150,8 @@ class ObjCLanguage : public Language { std::unique_ptr GetTypeScavenger() override; - bool GetFormatterPrefixSuffix(ValueObject &valobj, ConstString type_hint, - std::string &prefix, - std::string &suffix) override; + std::pair + GetFormatterPrefixSuffix(llvm::StringRef type_hint) override; bool IsNilReference(ValueObject &valobj) override; diff --git a/lldb/source/Target/Language.cpp b/lldb/source/Target/Language.cpp index 4299c402c613a..a307cb345c005 100644 --- a/lldb/source/Target/Language.cpp +++ b/lldb/source/Target/Language.cpp @@ -452,11 +452,9 @@ bool Language::ImageListTypeScavenger::Find_Impl( return result; } -bool Language::GetFormatterPrefixSuffix(ValueObject &valobj, - ConstString 
type_hint, - std::string &prefix, - std::string &suffix) { - return false; +std::pair +Language::GetFormatterPrefixSuffix(llvm::StringRef type_hint) { + return std::pair(); } bool Language::DemangledNameContainsPath(llvm::StringRef path, From 692f3059fb95fe191033d2f710c51babc08b9425 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 30 May 2023 13:16:15 -0700 Subject: [PATCH 135/704] [Driver] Remove unused class ForceSuccessCommand The last use was removed by: commit 6625680a581c5e29c53d9f58d864cc6cd3cd05f6 Author: Hans Wennborg Date: Tue Feb 2 14:10:26 2021 +0100 Differential Revision: https://reviews.llvm.org/D151609 --- clang/include/clang/Driver/Job.h | 17 ----------------- clang/lib/Driver/Job.cpp | 24 ------------------------ 2 files changed, 41 deletions(-) diff --git a/clang/include/clang/Driver/Job.h b/clang/include/clang/Driver/Job.h index e866679dc1a91..df9449463c53b 100644 --- a/clang/include/clang/Driver/Job.h +++ b/clang/include/clang/Driver/Job.h @@ -258,23 +258,6 @@ class CC1Command : public Command { void setEnvironment(llvm::ArrayRef NewEnvironment) override; }; -/// Like Command, but always pretends that the wrapped command succeeded. -class ForceSuccessCommand : public Command { -public: - ForceSuccessCommand(const Action &Source_, const Tool &Creator_, - ResponseFileSupport ResponseSupport, - const char *Executable_, - const llvm::opt::ArgStringList &Arguments_, - ArrayRef Inputs, - ArrayRef Outputs = std::nullopt); - - void Print(llvm::raw_ostream &OS, const char *Terminator, bool Quote, - CrashReportInfo *CrashInfo = nullptr) const override; - - int Execute(ArrayRef> Redirects, std::string *ErrMsg, - bool *ExecutionFailed) const override; -}; - /// JobList - A sequence of jobs to perform. class JobList { public: diff --git a/clang/lib/Driver/Job.cpp b/clang/lib/Driver/Job.cpp index f85f55cd1ff54..573e91856d776 100644 --- a/clang/lib/Driver/Job.cpp +++ b/clang/lib/Driver/Job.cpp @@ -449,30 +449,6 @@ void CC1Command::setEnvironment(llvm::ArrayRef NewEnvironment) { "The CC1Command doesn't support changing the environment vars!"); } -ForceSuccessCommand::ForceSuccessCommand( - const Action &Source_, const Tool &Creator_, - ResponseFileSupport ResponseSupport, const char *Executable_, - const llvm::opt::ArgStringList &Arguments_, ArrayRef Inputs, - ArrayRef Outputs) - : Command(Source_, Creator_, ResponseSupport, Executable_, Arguments_, - Inputs, Outputs) {} - -void ForceSuccessCommand::Print(raw_ostream &OS, const char *Terminator, - bool Quote, CrashReportInfo *CrashInfo) const { - Command::Print(OS, "", Quote, CrashInfo); - OS << " || (exit 0)" << Terminator; -} - -int ForceSuccessCommand::Execute(ArrayRef> Redirects, - std::string *ErrMsg, - bool *ExecutionFailed) const { - int Status = Command::Execute(Redirects, ErrMsg, ExecutionFailed); - (void)Status; - if (ExecutionFailed) - *ExecutionFailed = false; - return 0; -} - void JobList::Print(raw_ostream &OS, const char *Terminator, bool Quote, CrashReportInfo *CrashInfo) const { for (const auto &Job : *this) From db7f639b900dca266ea9f47c934418af0a67122b Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Tue, 30 May 2023 19:00:53 +0000 Subject: [PATCH 136/704] [mlir][sparse] fix a crash when generating sparse convolution with nchw input Reviewed By: aartbik Differential Revision: https://reviews.llvm.org/D151744 --- .../Transforms/Sparsification.cpp | 54 +++--- .../CPU/sparse_conv_2d_nchw_fchw.mlir | 178 ++++++++++++++++++ 2 files changed, 205 insertions(+), 27 deletions(-) create mode 100644 
mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp index 9c2465d25737d..1b711992a30d5 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -515,6 +515,15 @@ static bool topSortOptimal(CodegenEnv &env, return env.topSortSize() == numLoops; } +static void addIterOrdering(LoopId f, LoopId t, + std::vector> &adjM, + std::vector &inDegree) { + if (!adjM[f][t] && f != t) { + adjM[f][t] = true; + inDegree[t]++; + } +} + /// Helper method to add all constraints from the indices in one affine /// expression before all indices in the other affine expression. For /// example i0+i1 < i2+i3+1 yields i0> &adjM, // Recursion leaf. assert(fidx && tidx); const LoopId f = *fidx, t = *tidx; - if (!adjM[f][t]) { - adjM[f][t] = true; - inDegree[t]++; - } + addIterOrdering(f, t, adjM, inDegree); return; } // Picks an affine expression and expand (recurse into) it. @@ -693,6 +699,18 @@ static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t, const AffineExpr fa = map.getResult(toOrigDim(enc, lvl - 1)); const AffineExpr ta = map.getResult(toOrigDim(enc, lvl)); + if (auto fdim = fa.dyn_cast()) { + AffineDimCollector tCollector; + tCollector.walkPostOrder(ta); + + const LoopId f = env.makeLoopId(fdim.getPosition()); + for (auto td : tCollector.dims) { + const LoopId t = env.makeLoopId(td.getPosition()); + addIterOrdering(f, t, adjM, inDegree); + } + continue; + } + // This is a heuristic, we pick an abitrary reduction loop from lhs and // rhs and use them as d_x and d_y. finder.walkPostOrder(fa); @@ -704,10 +722,7 @@ static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t, const LoopId tldx = env.makeLoopId(texp.getPosition()); // d_x > d_y - if (!adjM[fldx][tldx]) { - adjM[fldx][tldx] = true; - inDegree[tldx]++; - } + addIterOrdering(fldx, tldx, adjM, inDegree); AffineDimCollector fCollector; fCollector.walkPostOrder(fa); @@ -717,21 +732,11 @@ static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t, // make sure dx and dy is the last; for (auto fd : fCollector.dims) { const LoopId f = env.makeLoopId(fd.getPosition()); - if (f == fldx) - continue; - if (!adjM[f][fldx]) { - adjM[f][fldx] = true; - inDegree[fldx]++; - } + addIterOrdering(f, fldx, adjM, inDegree); } for (auto td : tCollector.dims) { const LoopId t = env.makeLoopId(td.getPosition()); - if (t == tldx) - continue; - if (!adjM[t][tldx]) { - adjM[t][tldx] = true; - inDegree[tldx]++; - } + addIterOrdering(t, tldx, adjM, inDegree); } // Since we only support affine addition, the order between two dim // expression does not really matters. 
@@ -746,15 +751,11 @@ static void addSliceBasedConstraints(CodegenEnv &env, OpOperand &t, const LoopId f = env.makeLoopId(fd.getPosition()); if (f == fldx) // skip d_x continue; - for (auto td : tCollector.dims) { const LoopId t = env.makeLoopId(td.getPosition()); if (t == tldx) // skip d_y continue; - if (!adjM[f][t]) { - adjM[f][t] = true; - inDegree[t]++; - } + addIterOrdering(f, t, adjM, inDegree); } } } @@ -797,8 +798,7 @@ static bool computeIterationGraph(CodegenEnv &env, SortMask mask, isSingletonDLT(dltI)) { for (LoopId j = 0; j < numLoops; j++) if (isUndefDLT(env.dlt(tid, j))) { - adjM[i][j] = true; - inDegree[j]++; + addIterOrdering(i, j, adjM, inDegree); } } else { assert(isDenseDLT(dltI) || isUndefDLT(dltI)); diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir new file mode 100644 index 0000000000000..1d71990e55b32 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir @@ -0,0 +1,178 @@ +// DEFINE: %{option} = "enable-runtime-library=true enable-index-reduction=true" +// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option} +// DEFINE: %{run} = mlir-cpu-runner \ +// DEFINE: -e entry -entry-point-result=void \ +// DEFINE: -shared-libs=%mlir_c_runner_utils | \ +// DEFINE: FileCheck %s +// +// RUN: %{compile} | %{run} +// +// Do the same run, but now with direct IR generation. +// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true" +// RUN: %{compile} | %{run} +// +// Do the same run, but now with direct IR generation and vectorization. +// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true" +// RUN: %{compile} | %{run} + +// Do the same run, but now with direct IR generation and, if available, VLA +// vectorization. +// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true" +// REDEFINE: %{run} = %lli_host_or_aarch64_cmd \ +// REDEFINE: --entry-function=entry_lli \ +// REDEFINE: --extra-module=%S/Inputs/main_for_lli.ll \ +// REDEFINE: %VLA_ARCH_ATTR_OPTIONS \ +// REDEFINE: --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \ +// REDEFINE: FileCheck %s +// RUN: %{compile} | mlir-translate -mlir-to-llvmir | %{run} + + +// TODO: we can only support dense output for nchw input because 'c' is a reduction loop + + +#CCCD = #sparse_tensor.encoding<{ + lvlTypes = [ "dense", "dense", "dense", "compressed" ] +}> + + +#CCCC = #sparse_tensor.encoding<{ + lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ] +}> + +// FIXME: CDCD encoding crashes! 
+ +// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f +func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nchw_fchw(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nchw_fchw_CCCD(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nchw_fchw_CCCC(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %filter2D_nhwc = call @alloc_4d_filled_f32(%c1, %c3, %c3, %c3, %val) :(index, index, index, index, f32) -> (tensor) + %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c3, %c8, %c8, %val) : (index, index, index, index, f32) -> (tensor) + %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c0, %c0, %c3] : tensor + %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) + %out2D_nhwc_CCCD = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) + %out2D_nhwc_CCCC = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) + + %in2D_nhwc_CCCD = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + + %dense_ret = call @conv_2d_nchw_fchw(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor, tensor, tensor) -> (tensor) + %CCCC_ret = call @conv_2d_nchw_fchw_CCCD(%in2D_nhwc_CCCD, %filter2D_nhwc, %out2D_nhwc_CCCD) : (tensor, tensor, tensor) -> (tensor) + %CDCD_ret = call @conv_2d_nchw_fchw_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc_CCCC) : (tensor, tensor, tensor) -> (tensor) + + + // CHECK: ( ( ( ( 108, 124, 124, 124, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ), + // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ), + // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 
108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) ) + %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero + : tensor, vector<3x1x6x6xf32> + vector.print %dense_v : vector<3x1x6x6xf32> + + // CHECK: ( ( ( ( 108, 124, 124, 124, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ), + // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ), + // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) ) + %v1 = vector.transfer_read %CCCC_ret[%c0, %c0, %c0, %c0], %zero + : tensor, vector<3x1x6x6xf32> + vector.print %v1 : vector<3x1x6x6xf32> + + // CHECK: ( ( ( ( 108, 124, 124, 124, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ), + // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ), + // CHECK-SAME: ( ( ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ), + // CHECK-SAME: ( 108, 108, 108, 108, 108, 108 ) ) ) ) + %v2 = vector.transfer_read %CDCD_ret[%c0, %c0, %c0, %c0], %zero + : tensor, vector<3x1x6x6xf32> + vector.print %v2 : vector<3x1x6x6xf32> + + // Free the resources + bufferization.dealloc_tensor %in2D_nhwc : tensor + bufferization.dealloc_tensor %filter2D_nhwc : tensor + bufferization.dealloc_tensor %out2D_nhwc : tensor + bufferization.dealloc_tensor %out2D_nhwc_CCCD : tensor + bufferization.dealloc_tensor %out2D_nhwc_CCCC : tensor + + bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor + bufferization.dealloc_tensor %in2D_nhwc_CCCD : tensor + return +} From f46638b01d1bd66aa879188132e0d19a0a7f5928 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Thu, 25 May 2023 18:19:46 -0700 Subject: [PATCH 137/704] [lldb][NFCI] Change type of SBDebugger::m_instance_name This doesn't need to be in the ConstString StringPool. There's little benefit to having these be unique, and we don't need fast comparisons on them. 
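The SB API boundary is the one place that still round-trips through
ConstString: SBDebugger::GetInstanceName() returns a raw const char *, so the
name is interned there to keep the returned pointer valid after the call
returns. Roughly, as a sketch mirroring the SBDebugger.cpp hunk below
(instrumentation macro omitted):

  const char *SBDebugger::GetInstanceName() {
    if (!m_opaque_sp)
      return nullptr;
    // Intern in the ConstString pool so the returned C string outlives this call.
    return ConstString(m_opaque_sp->GetInstanceName()).AsCString();
  }
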
Differential Revision: https://reviews.llvm.org/D151524 --- lldb/include/lldb/Core/Debugger.h | 6 ++-- lldb/source/API/SBDebugger.cpp | 15 +++++---- lldb/source/Core/Debugger.cpp | 31 ++++++++++--------- .../Python/ScriptInterpreterPython.cpp | 2 +- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index 54f7d5c0edb4a..b63597fc71b4c 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -116,7 +116,7 @@ class Debugger : public std::enable_shared_from_this, static lldb::DebuggerSP FindDebuggerWithID(lldb::user_id_t id); static lldb::DebuggerSP - FindDebuggerWithInstanceName(ConstString instance_name); + FindDebuggerWithInstanceName(llvm::StringRef instance_name); static size_t GetNumDebuggers(); @@ -359,7 +359,7 @@ class Debugger : public std::enable_shared_from_this, bool GetNotifyVoid() const; - ConstString GetInstanceName() { return m_instance_name; } + const std::string &GetInstanceName() { return m_instance_name; } bool LoadPlugin(const FileSpec &spec, Status &error); @@ -644,7 +644,7 @@ class Debugger : public std::enable_shared_from_this, llvm::StringMap> m_stream_handlers; std::shared_ptr m_callback_handler_sp; - ConstString m_instance_name; + const std::string m_instance_name; static LoadPluginCallbackType g_load_plugin_callback; typedef std::vector LoadedPluginsList; LoadedPluginsList m_loaded_plugins; diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 9e9b01f830b59..9641e2f9c8a08 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -1326,7 +1326,10 @@ SBDebugger SBDebugger::FindDebuggerWithID(int id) { const char *SBDebugger::GetInstanceName() { LLDB_INSTRUMENT_VA(this); - return (m_opaque_sp ? 
m_opaque_sp->GetInstanceName().AsCString() : nullptr); + if (!m_opaque_sp) + return nullptr; + + return ConstString(m_opaque_sp->GetInstanceName()).AsCString(); } SBError SBDebugger::SetInternalVariable(const char *var_name, const char *value, @@ -1334,8 +1337,8 @@ SBError SBDebugger::SetInternalVariable(const char *var_name, const char *value, LLDB_INSTRUMENT_VA(var_name, value, debugger_instance_name); SBError sb_error; - DebuggerSP debugger_sp(Debugger::FindDebuggerWithInstanceName( - ConstString(debugger_instance_name))); + DebuggerSP debugger_sp( + Debugger::FindDebuggerWithInstanceName(debugger_instance_name)); Status error; if (debugger_sp) { ExecutionContext exe_ctx( @@ -1356,8 +1359,8 @@ SBDebugger::GetInternalVariableValue(const char *var_name, const char *debugger_instance_name) { LLDB_INSTRUMENT_VA(var_name, debugger_instance_name); - DebuggerSP debugger_sp(Debugger::FindDebuggerWithInstanceName( - ConstString(debugger_instance_name))); + DebuggerSP debugger_sp( + Debugger::FindDebuggerWithInstanceName(debugger_instance_name)); Status error; if (debugger_sp) { ExecutionContext exe_ctx( @@ -1487,7 +1490,7 @@ bool SBDebugger::GetDescription(SBStream &description) { Stream &strm = description.ref(); if (m_opaque_sp) { - const char *name = m_opaque_sp->GetInstanceName().AsCString(); + const char *name = m_opaque_sp->GetInstanceName().c_str(); user_id_t id = m_opaque_sp->GetID(); strm.Printf("Debugger (instance: \"%s\", id: %" PRIu64 ")", name, id); } else diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 1d92f2f52c2f7..ad177637f45b4 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -740,19 +740,20 @@ void Debugger::Destroy(DebuggerSP &debugger_sp) { } } -DebuggerSP Debugger::FindDebuggerWithInstanceName(ConstString instance_name) { - DebuggerSP debugger_sp; - if (g_debugger_list_ptr && g_debugger_list_mutex_ptr) { - std::lock_guard guard(*g_debugger_list_mutex_ptr); - DebuggerList::iterator pos, end = g_debugger_list_ptr->end(); - for (pos = g_debugger_list_ptr->begin(); pos != end; ++pos) { - if ((*pos)->m_instance_name == instance_name) { - debugger_sp = *pos; - break; - } - } +DebuggerSP +Debugger::FindDebuggerWithInstanceName(llvm::StringRef instance_name) { + if (!g_debugger_list_ptr || !g_debugger_list_mutex_ptr) + return DebuggerSP(); + + std::lock_guard guard(*g_debugger_list_mutex_ptr); + for (const DebuggerSP &debugger_sp : *g_debugger_list_ptr) { + if (!debugger_sp) + continue; + + if (llvm::StringRef(debugger_sp->GetInstanceName()) == instance_name) + return debugger_sp; } - return debugger_sp; + return DebuggerSP(); } TargetSP Debugger::FindTargetWithProcessID(lldb::pid_t pid) { @@ -801,13 +802,13 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) m_source_manager_up(), m_source_file_cache(), m_command_interpreter_up( std::make_unique(*this, false)), - m_io_handler_stack(), m_instance_name(), m_loaded_plugins(), - m_event_handler_thread(), m_io_handler_thread(), + m_io_handler_stack(), + m_instance_name(llvm::formatv("debugger_{0}", GetID()).str()), + m_loaded_plugins(), m_event_handler_thread(), m_io_handler_thread(), m_sync_broadcaster(nullptr, "lldb.debugger.sync"), m_broadcaster(m_broadcaster_manager_sp, GetStaticBroadcasterClass().AsCString()), m_forward_listener_sp(), m_clear_once() { - m_instance_name.SetString(llvm::formatv("debugger_{0}", GetID()).str()); // Initialize the debugger properties as early as possible as other parts of // LLDB will start querying them during 
construction. m_collection_sp->Initialize(g_debugger_properties); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index ebc5990850942..902c7fad1105f 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -408,7 +408,7 @@ ScriptInterpreterPythonImpl::ScriptInterpreterPythonImpl(Debugger &debugger) m_session_dict(PyInitialValue::Invalid), m_sys_module_dict(PyInitialValue::Invalid), m_run_one_line_function(), m_run_one_line_str_global(), - m_dictionary_name(m_debugger.GetInstanceName().AsCString()), + m_dictionary_name(m_debugger.GetInstanceName()), m_active_io_handler(eIOHandlerNone), m_session_is_active(false), m_pty_secondary_is_open(false), m_valid_session(true), m_lock_count(0), m_command_thread_state(nullptr) { From af8e3861025f3c931cc67ced86d6bd8c939e6fc0 Mon Sep 17 00:00:00 2001 From: Nitin John Raj Date: Tue, 30 May 2023 12:54:24 -0700 Subject: [PATCH 138/704] [RISCV][GlobalISel] Add lowerFormalArguments for calling convention This patch adds an IncomingValueHandler and IncomingValueAssigner, and implements minimal support for lowering formal arguments according to the RISC-V calling convention. Simple non-aggregate integer and pointer types are supported. In the future, we must correctly handle byval and sret pointer arguments, and instances where the number of arguments exceeds the number of registers. Coauthored By: lewis-revill Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D74977 --- .../Target/RISCV/GISel/RISCVCallLowering.cpp | 96 +++++- .../RISCV/GlobalISel/irtranslator/args.ll | 297 ++++++++++++++++++ 2 files changed, 391 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/args.ll diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 7b39b1c9444d4..cc6cf64a9a2db 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -79,6 +79,60 @@ struct RISCVOutgoingValueHandler : public CallLowering::OutgoingValueHandler { } }; +struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { +private: + // The function used internally to assign args - we ignore the AssignFn stored + // by IncomingValueAssigner since RISC-V implements its CC using a custom + // function with a different signature. + RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn; + + // Whether this is assigning args from a return. 
+ bool IsRet; + +public: + RISCVIncomingValueAssigner( + RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) + : CallLowering::IncomingValueAssigner(nullptr), + RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet) {} + + bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags, + CCState &State) override { + MachineFunction &MF = State.getMachineFunction(); + const DataLayout &DL = MF.getDataLayout(); + const RISCVSubtarget &Subtarget = MF.getSubtarget(); + + return RISCVAssignFn(DL, Subtarget.getTargetABI(), ValNo, ValVT, LocVT, + LocInfo, Flags, State, /*IsFixed=*/true, IsRet, + Info.Ty, *Subtarget.getTargetLowering(), + /*FirstMaskArgument=*/std::nullopt); + } +}; + +struct RISCVIncomingValueHandler : public CallLowering::IncomingValueHandler { + RISCVIncomingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI) + : IncomingValueHandler(B, MRI) {} + + Register getStackAddress(uint64_t MemSize, int64_t Offset, + MachinePointerInfo &MPO, + ISD::ArgFlagsTy Flags) override { + llvm_unreachable("not implemented"); + } + + void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy, + MachinePointerInfo &MPO, CCValAssign &VA) override { + llvm_unreachable("not implemented"); + } + + void assignValueToReg(Register ValVReg, Register PhysReg, + CCValAssign VA) override { + // Copy argument received in physical register to desired VReg. + MIRBuilder.getMBB().addLiveIn(PhysReg); + MIRBuilder.buildCopy(ValVReg, PhysReg); + } +}; + } // namespace RISCVCallLowering::RISCVCallLowering(const RISCVTargetLowering &TLI) @@ -131,11 +185,49 @@ bool RISCVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef> VRegs, FunctionLoweringInfo &FLI) const { - + // Early exit if there are no arguments. if (F.arg_empty()) return true; - return false; + // TODO: Support vararg functions. + if (F.isVarArg()) + return false; + + // TODO: Support all argument types. + for (auto &Arg : F.args()) { + if (Arg.getType()->isIntegerTy()) + continue; + if (Arg.getType()->isPointerTy()) + continue; + return false; + } + + MachineFunction &MF = MIRBuilder.getMF(); + const DataLayout &DL = MF.getDataLayout(); + CallingConv::ID CC = F.getCallingConv(); + + SmallVector SplitArgInfos; + unsigned Index = 0; + for (auto &Arg : F.args()) { + // Construct the ArgInfo object from destination register and argument type. + ArgInfo AInfo(VRegs[Index], Arg.getType(), Index); + setArgFlags(AInfo, Index + AttributeList::FirstArgIndex, DL, F); + + // Handle any required merging from split value types from physical + // registers into the desired VReg. ArgInfo objects are constructed + // correspondingly and appended to SplitArgInfos. + splitToValueTypes(AInfo, SplitArgInfos, DL, CC); + + ++Index; + } + + RISCVIncomingValueAssigner Assigner( + CC == CallingConv::Fast ? 
RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, + /*IsRet=*/false); + RISCVIncomingValueHandler Handler(MIRBuilder, MF.getRegInfo()); + + return determineAndHandleAssignments(Handler, Assigner, SplitArgInfos, + MIRBuilder, CC, F.isVarArg()); } bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/args.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/args.ll new file mode 100644 index 0000000000000..fda08d91dc3a2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/args.ll @@ -0,0 +1,297 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -mtriple=riscv32 -global-isel -stop-after=irtranslator -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32I %s +; RUN: llc -mtriple=riscv64 -global-isel -stop-after=irtranslator -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64I %s + +define void @test_args_i8(i8 %a) { + + ; RV32I-LABEL: name: test_args_i8 + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32) + ; RV32I-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1 + ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s8) = G_ADD [[TRUNC]], [[C]] + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_i8 + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64) + ; RV64I-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1 + ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s8) = G_ADD [[TRUNC]], [[C]] + ; RV64I-NEXT: PseudoRET +entry: + %0 = add i8 %a, 1 + ret void +} + +define void @test_args_i16(i16 %a) { + + ; RV32I-LABEL: name: test_args_i16 + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; RV32I-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[C]] + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_i16 + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64) + ; RV64I-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[C]] + ; RV64I-NEXT: PseudoRET +entry: + %0 = add i16 %a, 1 + ret void +} + +define void @test_args_i32(i32 %a) { + + ; RV32I-LABEL: name: test_args_i32 + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[C]] + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_i32 + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; RV64I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC]], [[C]] + ; RV64I-NEXT: PseudoRET +entry: + %0 = add i32 %a, 1 + ret void +} + +define void @test_args_i64(i64 %a) { + + ; RV32I-LABEL: name: test_args_i64 + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10, $x11 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: 
[[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; RV32I-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; RV32I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[MV]], [[C]] + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_i64 + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[C]] + ; RV64I-NEXT: PseudoRET +entry: + %0 = add i64 %a, 1 + ret void +} + +define void @test_args_i8_ptr(ptr %a) { + + ; RV32I-LABEL: name: test_args_i8_ptr + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV32I-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.a) + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_i8_ptr + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV64I-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.a) + ; RV64I-NEXT: PseudoRET +entry: + %0 = load i8, ptr %a + ret void +} + +define void @test_args_2xi8(i8 %a, i8 %b) { + + ; RV32I-LABEL: name: test_args_2xi8 + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10, $x11 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32) + ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; RV32I-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) + ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s8) = G_ADD [[TRUNC]], [[TRUNC1]] + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_2xi8 + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10, $x11 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s64) + ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 + ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s64) + ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s8) = G_ADD [[TRUNC]], [[TRUNC1]] + ; RV64I-NEXT: PseudoRET +entry: + %0 = add i8 %a, %b + ret void +} + +define void @test_args_2xi16(i16 %a, i16 %b) { + + ; RV32I-LABEL: name: test_args_2xi16 + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10, $x11 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; RV32I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC1]] + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_2xi16 + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10, $x11 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64) + ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 + ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64) + ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s16) = G_ADD [[TRUNC]], [[TRUNC1]] + ; RV64I-NEXT: PseudoRET +entry: + %0 = add i16 %a, %b + ret void +} + +define void @test_args_2xi32(i32 %a, i32 %b) { + + ; RV32I-LABEL: name: test_args_2xi32 + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10, $x11 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32I-NEXT: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_2xi32 + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10, $x11 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 + ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC]], [[TRUNC1]] + ; RV64I-NEXT: PseudoRET +entry: + %0 = add i32 %a, %b + ret void +} + +define void @test_args_2xi64(i64 %a, i64 %b) { + + ; RV32I-LABEL: name: test_args_2xi64 + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10, $x11, $x12, $x13 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 + ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 + ; RV32I-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12 + ; RV32I-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13 + ; RV32I-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[MV]], [[MV1]] + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_2xi64 + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10, $x11 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 + ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 + ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]] + ; RV64I-NEXT: PseudoRET +entry: + %0 = add i64 %a, %b + ret void +} + +define void @test_args_2xi8_ptr(ptr %a, ptr %b) { + + ; RV32I-LABEL: name: test_args_2xi8_ptr + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10, $x11 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; RV32I-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.a) + ; RV32I-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[COPY1]](p0) :: (load (s8) from %ir.b) + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_2xi8_ptr + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10, $x11 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 + ; RV64I-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load (s8) from %ir.a) + ; RV64I-NEXT: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[COPY1]](p0) :: (load (s8) from %ir.b) + ; RV64I-NEXT: PseudoRET +entry: + %0 = load i8, ptr %a + %1 = load i8, ptr %b + ret void +} + +define void @test_args_ptr_byval(ptr byval(i8) %a) { + ; RV32I-LABEL: name: test_args_ptr_byval + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV32I-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (dereferenceable load (s8) from %ir.a) + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_ptr_byval + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV64I-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (dereferenceable load (s8) from %ir.a) + ; RV64I-NEXT: PseudoRET +entry: + %0 = load i8, ptr %a + ret void +} + +define void @test_args_ptr_sret(ptr sret(i8) %a) { + ; RV32I-LABEL: name: test_args_ptr_sret + ; RV32I: bb.1.entry: + ; RV32I-NEXT: liveins: $x10 + ; RV32I-NEXT: {{ $}} + ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; 
RV32I-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (dereferenceable load (s8) from %ir.a) + ; RV32I-NEXT: PseudoRET + ; RV64I-LABEL: name: test_args_ptr_sret + ; RV64I: bb.1.entry: + ; RV64I-NEXT: liveins: $x10 + ; RV64I-NEXT: {{ $}} + ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x10 + ; RV64I-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (dereferenceable load (s8) from %ir.a) + ; RV64I-NEXT: PseudoRET +entry: + %0 = load i8, ptr %a + ret void +} From 9e8a412cb37d2a1201bd33878fce0993587ef335 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Fri, 26 May 2023 16:36:13 -0700 Subject: [PATCH 139/704] [lldb][NFCI] Remove use of ConstString from StructuredDataDarwinLog static functions The strings "DarwinLog" and "log" probably do not need to be in the ConstString StringPool. We still create ConstStrings from them in some places (for now) but that's because we don't have an implicit constructor to convert a StringRef to a ConstString. Differential Revision: https://reviews.llvm.org/D151599 --- .../DarwinLog/StructuredDataDarwinLog.cpp | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp index deebf0700f947..a9f88233d9463 100644 --- a/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp +++ b/lldb/source/Plugins/StructuredData/DarwinLog/StructuredDataDarwinLog.cpp @@ -162,13 +162,13 @@ const char *const s_filter_attributes[] = { // used to format message text }; -static ConstString GetDarwinLogTypeName() { - static const ConstString s_key_name("DarwinLog"); +static llvm::StringRef GetDarwinLogTypeName() { + static constexpr llvm::StringLiteral s_key_name("DarwinLog"); return s_key_name; } -static ConstString GetLogEventType() { - static const ConstString s_event_type("log"); +static llvm::StringRef GetLogEventType() { + static constexpr llvm::StringLiteral s_event_type("log"); return s_event_type; } @@ -799,8 +799,8 @@ class EnableCommand : public CommandObjectParsed { } // Get the plugin for the process. - auto plugin_sp = - process_sp->GetStructuredDataPlugin(GetDarwinLogTypeName()); + auto plugin_sp = process_sp->GetStructuredDataPlugin( + ConstString(GetDarwinLogTypeName())); if (!plugin_sp || (plugin_sp->GetPluginName() != StructuredDataDarwinLog::GetStaticPluginName())) { result.AppendError("failed to get StructuredDataPlugin for " @@ -822,8 +822,8 @@ class EnableCommand : public CommandObjectParsed { // Send configuration to the feature by way of the process. Construct the // options we will use. auto config_sp = m_options_sp->BuildConfigurationData(m_enable); - const Status error = - process_sp->ConfigureStructuredData(GetDarwinLogTypeName(), config_sp); + const Status error = process_sp->ConfigureStructuredData( + ConstString(GetDarwinLogTypeName()), config_sp); // Report results. if (!error.Success()) { @@ -871,8 +871,8 @@ class StatusCommand : public CommandObjectParsed { stream.PutCString("Enabled: not applicable " "(requires process)\n"); } else { - auto plugin_sp = - process_sp->GetStructuredDataPlugin(GetDarwinLogTypeName()); + auto plugin_sp = process_sp->GetStructuredDataPlugin( + ConstString(GetDarwinLogTypeName())); stream.Printf("Availability: %s\n", plugin_sp ? 
"available" : "unavailable"); llvm::StringRef plugin_name = StructuredDataDarwinLog::GetStaticPluginName(); @@ -1089,7 +1089,7 @@ void StructuredDataDarwinLog::HandleArrivalOfStructuredData( LLDB_LOGF(log, "StructuredDataDarwinLog::%s() StructuredData type " "expected to be %s but was %s, ignoring", - __FUNCTION__, GetDarwinLogTypeName().AsCString(), + __FUNCTION__, GetDarwinLogTypeName().str().c_str(), type_name.AsCString()); return; } @@ -1142,7 +1142,7 @@ Status StructuredDataDarwinLog::GetDescription( } // Validate this is really a message for our plugin. - ConstString type_name; + llvm::StringRef type_name; if (!dictionary->GetValueForKeyAsString("type", type_name)) { SetErrorWithJSON(error, "Structured data doesn't contain mandatory " "type field", @@ -1490,13 +1490,11 @@ bool StructuredDataDarwinLog::InitCompletionHookCallback( LLDB_LOGF(log, "StructuredDataDarwinLog::%s() call is for process uid %d", __FUNCTION__, process_sp->GetUniqueID()); - auto plugin_sp = process_sp->GetStructuredDataPlugin(GetDarwinLogTypeName()); + auto plugin_sp = + process_sp->GetStructuredDataPlugin(ConstString(GetDarwinLogTypeName())); if (!plugin_sp) { - LLDB_LOGF(log, - "StructuredDataDarwinLog::%s() warning: no plugin for " - "feature %s in process uid %u", - __FUNCTION__, GetDarwinLogTypeName().AsCString(), - process_sp->GetUniqueID()); + LLDB_LOG(log, "warning: no plugin for feature {0} in process uid {1}", + GetDarwinLogTypeName(), process_sp->GetUniqueID()); return false; } @@ -1736,7 +1734,7 @@ StructuredDataDarwinLog::DumpHeader(Stream &output_stream, size_t StructuredDataDarwinLog::HandleDisplayOfEvent( const StructuredData::Dictionary &event, Stream &stream) { // Check the type of the event. - ConstString event_type; + llvm::StringRef event_type; if (!event.GetValueForKeyAsString("type", event_type)) { // Hmm, we expected to get events that describe what they are. Continue // anyway. @@ -1836,8 +1834,8 @@ void StructuredDataDarwinLog::EnableNow() { // We can run it directly. // Send configuration to the feature by way of the process. - const Status error = - process_sp->ConfigureStructuredData(GetDarwinLogTypeName(), config_sp); + const Status error = process_sp->ConfigureStructuredData( + ConstString(GetDarwinLogTypeName()), config_sp); // Report results. if (!error.Success()) { From 769d282d7292d14591a721ee967962736160095e Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 25 May 2023 12:38:45 -0700 Subject: [PATCH 140/704] [clang][lex] NFCI: Use FileEntryRef in ModuleMap::{load,lookup}ModuleMap() This patch changes the return/argument types of `ModuleMap::{load,lookup}ModuleMap()` from `const FileEntry *` to `FileEntryRef` in order to remove uses of the deprecated `DirectoryEntry::getName()`. Reviewed By: bnbarham Differential Revision: https://reviews.llvm.org/D127647 --- clang/include/clang/Lex/HeaderSearch.h | 13 +++++----- clang/lib/Frontend/FrontendAction.cpp | 9 +++---- clang/lib/Lex/HeaderSearch.cpp | 33 ++++++++++++-------------- clang/lib/Lex/ModuleMap.cpp | 6 ++--- 4 files changed, 29 insertions(+), 32 deletions(-) diff --git a/clang/include/clang/Lex/HeaderSearch.h b/clang/include/clang/Lex/HeaderSearch.h index 49fb99c1483ce..2a4e046be46fd 100644 --- a/clang/include/clang/Lex/HeaderSearch.h +++ b/clang/include/clang/Lex/HeaderSearch.h @@ -637,9 +637,9 @@ class HeaderSearch { bool AllowExtraModuleMapSearch = false); /// Try to find a module map file in the given directory, returning - /// \c nullptr if none is found. 
- const FileEntry *lookupModuleMapFile(const DirectoryEntry *Dir, - bool IsFramework); + /// \c nullopt if none is found. + OptionalFileEntryRef lookupModuleMapFile(const DirectoryEntry *Dir, + bool IsFramework); /// Determine whether there is a module map that may map the header /// with the given file name to a (sub)module. @@ -686,8 +686,8 @@ class HeaderSearch { /// used to resolve paths within the module (this is required when /// building the module from preprocessed source). /// \returns true if an error occurred, false otherwise. - bool loadModuleMapFile(const FileEntry *File, bool IsSystem, - FileID ID = FileID(), unsigned *Offset = nullptr, + bool loadModuleMapFile(FileEntryRef File, bool IsSystem, FileID ID = FileID(), + unsigned *Offset = nullptr, StringRef OriginalModuleMapFile = StringRef()); /// Collect the set of all known, top-level modules. @@ -904,8 +904,7 @@ class HeaderSearch { LMM_InvalidModuleMap }; - LoadModuleMapResult loadModuleMapFileImpl(const FileEntry *File, - bool IsSystem, + LoadModuleMapResult loadModuleMapFileImpl(FileEntryRef File, bool IsSystem, DirectoryEntryRef Dir, FileID ID = FileID(), unsigned *Offset = nullptr); diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index bd6d1b03e8f30..7ef480b3889c4 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -447,7 +447,8 @@ static bool loadModuleMapForModuleBuild(CompilerInstance &CI, bool IsSystem, // Map the current input to a file. FileID ModuleMapID = SrcMgr.getMainFileID(); - const FileEntry *ModuleMap = SrcMgr.getFileEntryForID(ModuleMapID); + OptionalFileEntryRef ModuleMap = SrcMgr.getFileEntryRefForID(ModuleMapID); + assert(ModuleMap && "MainFileID without FileEntry"); // If the module map is preprocessed, handle the initial line marker; // line directives are not part of the module map syntax in general. @@ -460,7 +461,7 @@ static bool loadModuleMapForModuleBuild(CompilerInstance &CI, bool IsSystem, } // Load the module map file. - if (HS.loadModuleMapFile(ModuleMap, IsSystem, ModuleMapID, &Offset, + if (HS.loadModuleMapFile(*ModuleMap, IsSystem, ModuleMapID, &Offset, PresumedModuleMapFile)) return true; @@ -469,7 +470,7 @@ static bool loadModuleMapForModuleBuild(CompilerInstance &CI, bool IsSystem, // Infer framework module if possible. if (HS.getModuleMap().canInferFrameworkModule(ModuleMap->getDir())) { - SmallString<128> InferredFrameworkPath = ModuleMap->getDir()->getName(); + SmallString<128> InferredFrameworkPath = ModuleMap->getDir().getName(); llvm::sys::path::append(InferredFrameworkPath, CI.getLangOpts().ModuleName + ".framework"); if (auto Dir = CI.getFileManager().getDirectory(InferredFrameworkPath)) @@ -910,7 +911,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, // If we were asked to load any module map files, do so now. 
for (const auto &Filename : CI.getFrontendOpts().ModuleMapFiles) { - if (auto File = CI.getFileManager().getFile(Filename)) + if (auto File = CI.getFileManager().getOptionalFileRef(Filename)) CI.getPreprocessor().getHeaderSearchInfo().loadModuleMapFile( *File, /*IsSystem*/false); else diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index d09d3ae12f581..7df1ca16f67ce 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -1654,10 +1654,10 @@ bool HeaderSearch::findUsableModuleForFrameworkHeader( return true; } -static const FileEntry *getPrivateModuleMap(const FileEntry *File, +static const FileEntry *getPrivateModuleMap(FileEntryRef File, FileManager &FileMgr) { - StringRef Filename = llvm::sys::path::filename(File->getName()); - SmallString<128> PrivateFilename(File->getDir()->getName()); + StringRef Filename = llvm::sys::path::filename(File.getName()); + SmallString<128> PrivateFilename(File.getDir().getName()); if (Filename == "module.map") llvm::sys::path::append(PrivateFilename, "module_private.map"); else if (Filename == "module.modulemap") @@ -1669,7 +1669,7 @@ static const FileEntry *getPrivateModuleMap(const FileEntry *File, return nullptr; } -bool HeaderSearch::loadModuleMapFile(const FileEntry *File, bool IsSystem, +bool HeaderSearch::loadModuleMapFile(FileEntryRef File, bool IsSystem, FileID ID, unsigned *Offset, StringRef OriginalModuleMapFile) { // Find the directory for the module. For frameworks, that may require going @@ -1688,9 +1688,7 @@ bool HeaderSearch::loadModuleMapFile(const FileEntry *File, bool IsSystem, Dir = FakeFile.getDir(); } } else { - // TODO: Replace with `Dir = File.getDir()` when `File` is switched to - // `FileEntryRef`. - Dir = FileMgr.getOptionalDirectoryRef(File->getDir()->getName()); + Dir = File.getDir(); } assert(Dir && "parent must exist"); @@ -1719,11 +1717,9 @@ bool HeaderSearch::loadModuleMapFile(const FileEntry *File, bool IsSystem, } HeaderSearch::LoadModuleMapResult -HeaderSearch::loadModuleMapFileImpl(const FileEntry *File, bool IsSystem, +HeaderSearch::loadModuleMapFileImpl(FileEntryRef File, bool IsSystem, DirectoryEntryRef Dir, FileID ID, unsigned *Offset) { - assert(File && "expected FileEntry"); - // Check whether we've already loaded this module map, and mark it as being // loaded in case we recursively try to load it from itself. auto AddResult = LoadedModuleMaps.insert(std::make_pair(File, true)); @@ -1747,23 +1743,23 @@ HeaderSearch::loadModuleMapFileImpl(const FileEntry *File, bool IsSystem, return LMM_NewlyLoaded; } -const FileEntry * +OptionalFileEntryRef HeaderSearch::lookupModuleMapFile(const DirectoryEntry *Dir, bool IsFramework) { if (!HSOpts->ImplicitModuleMaps) - return nullptr; + return std::nullopt; // For frameworks, the preferred spelling is Modules/module.modulemap, but // module.map at the framework root is also accepted. 
SmallString<128> ModuleMapFileName(Dir->getName()); if (IsFramework) llvm::sys::path::append(ModuleMapFileName, "Modules"); llvm::sys::path::append(ModuleMapFileName, "module.modulemap"); - if (auto F = FileMgr.getFile(ModuleMapFileName)) + if (auto F = FileMgr.getOptionalFileRef(ModuleMapFileName)) return *F; // Continue to allow module.map ModuleMapFileName = Dir->getName(); llvm::sys::path::append(ModuleMapFileName, "module.map"); - if (auto F = FileMgr.getFile(ModuleMapFileName)) + if (auto F = FileMgr.getOptionalFileRef(ModuleMapFileName)) return *F; // For frameworks, allow to have a private module map with a preferred @@ -1772,10 +1768,10 @@ HeaderSearch::lookupModuleMapFile(const DirectoryEntry *Dir, bool IsFramework) { ModuleMapFileName = Dir->getName(); llvm::sys::path::append(ModuleMapFileName, "Modules", "module.private.modulemap"); - if (auto F = FileMgr.getFile(ModuleMapFileName)) + if (auto F = FileMgr.getOptionalFileRef(ModuleMapFileName)) return *F; } - return nullptr; + return std::nullopt; } Module *HeaderSearch::loadFrameworkModule(StringRef Name, DirectoryEntryRef Dir, @@ -1818,9 +1814,10 @@ HeaderSearch::loadModuleMapFile(DirectoryEntryRef Dir, bool IsSystem, if (KnownDir != DirectoryHasModuleMap.end()) return KnownDir->second ? LMM_AlreadyLoaded : LMM_InvalidModuleMap; - if (const FileEntry *ModuleMapFile = lookupModuleMapFile(Dir, IsFramework)) { + if (OptionalFileEntryRef ModuleMapFile = + lookupModuleMapFile(Dir, IsFramework)) { LoadModuleMapResult Result = - loadModuleMapFileImpl(ModuleMapFile, IsSystem, Dir); + loadModuleMapFileImpl(*ModuleMapFile, IsSystem, Dir); // Add Dir explicitly in case ModuleMapFile is in a subdirectory. // E.g. Foo.framework/Modules/module.modulemap // ^Dir ^ModuleMapFile diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 0db7ebff29174..adaad64d47ef7 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -1019,9 +1019,9 @@ Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir, // We haven't looked here before. Load a module map, if there is // one. bool IsFrameworkDir = Parent.endswith(".framework"); - if (const FileEntry *ModMapFile = - HeaderInfo.lookupModuleMapFile(*ParentDir, IsFrameworkDir)) { - parseModuleMapFile(ModMapFile, Attrs.IsSystem, *ParentDir); + if (OptionalFileEntryRef ModMapFile = + HeaderInfo.lookupModuleMapFile(*ParentDir, IsFrameworkDir)) { + parseModuleMapFile(*ModMapFile, Attrs.IsSystem, *ParentDir); inferred = InferredDirectories.find(*ParentDir); } From 95279d7670cd54a50cf72d1fbc99701ef1faa72b Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 25 May 2023 14:02:32 -0700 Subject: [PATCH 141/704] [clang][lex] NFCI: Use DirectoryEntryRef in ModuleMap::inferFrameworkModule() This patch changes the argument type of `ModuleMap::inferFrameworkModule()` from `const DirectoryEntry *` to `DirectoryEntryRef` in order to remove the deprecated uses of `DirectoryEntry::getName()`. Depends on D127647. 
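As an illustrative aside (the snippet below is not code from this patch, and the helper name is made up), the ref-based API carries the name the directory was looked up by, so call sites migrate roughly along these lines:

```
#include "clang/Basic/FileManager.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical helper, for illustration only.
static void printDirName(clang::FileManager &FileMgr, llvm::StringRef Path) {
  // getOptionalDirectoryRef() yields an optional DirectoryEntryRef whose
  // getName() replaces the deprecated DirectoryEntry::getName().
  if (auto Dir = FileMgr.getOptionalDirectoryRef(Path))
    llvm::errs() << Dir->getName() << "\n";
}
```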
Reviewed By: bnbarham Differential Revision: https://reviews.llvm.org/D127648 --- clang/include/clang/Lex/ModuleMap.h | 8 ++++---- clang/lib/Frontend/FrontendAction.cpp | 3 ++- clang/lib/Lex/ModuleMap.cpp | 14 ++++++-------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/Lex/ModuleMap.h b/clang/include/clang/Lex/ModuleMap.h index 92697fe7deaa1..d291afa70619e 100644 --- a/clang/include/clang/Lex/ModuleMap.h +++ b/clang/include/clang/Lex/ModuleMap.h @@ -380,8 +380,8 @@ class ModuleMap { return static_cast(findHeaderInUmbrellaDirs(File, IntermediateDirs)); } - Module *inferFrameworkModule(const DirectoryEntry *FrameworkDir, - Attributes Attrs, Module *Parent); + Module *inferFrameworkModule(DirectoryEntryRef FrameworkDir, Attributes Attrs, + Module *Parent); public: /// Construct a new module map. @@ -591,8 +591,8 @@ class ModuleMap { /// Infer the contents of a framework module map from the given /// framework directory. - Module *inferFrameworkModule(const DirectoryEntry *FrameworkDir, - bool IsSystem, Module *Parent); + Module *inferFrameworkModule(DirectoryEntryRef FrameworkDir, bool IsSystem, + Module *Parent); /// Create a new top-level module that is shadowed by /// \p ShadowingModule. diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index 7ef480b3889c4..77d03c4062223 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -473,7 +473,8 @@ static bool loadModuleMapForModuleBuild(CompilerInstance &CI, bool IsSystem, SmallString<128> InferredFrameworkPath = ModuleMap->getDir().getName(); llvm::sys::path::append(InferredFrameworkPath, CI.getLangOpts().ModuleName + ".framework"); - if (auto Dir = CI.getFileManager().getDirectory(InferredFrameworkPath)) + if (auto Dir = + CI.getFileManager().getOptionalDirectoryRef(InferredFrameworkPath)) (void)HS.getModuleMap().inferFrameworkModule(*Dir, IsSystem, nullptr); } diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index adaad64d47ef7..efe2df0323ecc 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -972,14 +972,14 @@ static void inferFrameworkLink(Module *Mod) { /*IsFramework=*/true)); } -Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir, +Module *ModuleMap::inferFrameworkModule(DirectoryEntryRef FrameworkDir, bool IsSystem, Module *Parent) { Attributes Attrs; Attrs.IsSystem = IsSystem; return inferFrameworkModule(FrameworkDir, Attrs, Parent); } -Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir, +Module *ModuleMap::inferFrameworkModule(DirectoryEntryRef FrameworkDir, Attributes Attrs, Module *Parent) { // Note: as an egregious but useful hack we use the real path here, because // we might be looking at an embedded framework that symlinks out to a @@ -1010,7 +1010,7 @@ Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir, if (llvm::sys::path::has_parent_path(FrameworkDirName)) { // Figure out the parent path. StringRef Parent = llvm::sys::path::parent_path(FrameworkDirName); - if (auto ParentDir = FileMgr.getDirectory(Parent)) { + if (auto ParentDir = FileMgr.getOptionalDirectoryRef(Parent)) { // Check whether we have already looked into the parent directory // for a module map. llvm::DenseMap::const_iterator @@ -1057,7 +1057,7 @@ Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir, } // Look for an umbrella header. 
- SmallString<128> UmbrellaName = StringRef(FrameworkDir->getName()); + SmallString<128> UmbrellaName = FrameworkDir.getName(); llvm::sys::path::append(UmbrellaName, "Headers", ModuleName + ".h"); auto UmbrellaHeader = FileMgr.getOptionalFileRef(UmbrellaName); @@ -1103,8 +1103,7 @@ Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir, // Look for subframeworks. std::error_code EC; - SmallString<128> SubframeworksDirName - = StringRef(FrameworkDir->getName()); + SmallString<128> SubframeworksDirName = FrameworkDir.getName(); llvm::sys::path::append(SubframeworksDirName, "Frameworks"); llvm::sys::path::native(SubframeworksDirName); llvm::vfs::FileSystem &FS = FileMgr.getVirtualFileSystem(); @@ -1115,8 +1114,7 @@ Module *ModuleMap::inferFrameworkModule(const DirectoryEntry *FrameworkDir, if (!StringRef(Dir->path()).endswith(".framework")) continue; - if (auto SubframeworkDir = - FileMgr.getDirectory(Dir->path())) { + if (auto SubframeworkDir = FileMgr.getOptionalDirectoryRef(Dir->path())) { // Note: as an egregious but useful hack, we use the real path here and // check whether it is actually a subdirectory of the parent directory. // This will not be the case if the 'subframework' is actually a symlink From d574e918dba31e670a87c46e7de281819b3c0ea9 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 25 May 2023 14:13:15 -0700 Subject: [PATCH 142/704] [clang][lex] NFCI: Use DirectoryEntryRef in ModuleMap::parseModuleMapFile() This patch changes the argument type of `ModuleMap::parseModuleMapFile()` from `const DirectoryEntry *` to `DirectoryEntryRef` in order to remove the deprecated uses of `DirectoryEntry::getName()`. Depends on D127648. Reviewed By: bnbarham Differential Revision: https://reviews.llvm.org/D127651 --- .../modularize/ModularizeUtilities.cpp | 25 +++++++++---------- clang/include/clang/Lex/ModuleMap.h | 4 +-- clang/lib/Lex/ModuleMap.cpp | 14 +++++------ 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp index a7cadf818664a..5b09c916606d9 100644 --- a/clang-tools-extra/modularize/ModularizeUtilities.cpp +++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp @@ -258,34 +258,33 @@ std::error_code ModularizeUtilities::loadProblemHeaderList( std::error_code ModularizeUtilities::loadModuleMap( llvm::StringRef InputPath) { // Get file entry for module.modulemap file. - auto ModuleMapEntryOrErr = - SourceMgr->getFileManager().getFile(InputPath); + auto ModuleMapEntryOrErr = SourceMgr->getFileManager().getFileRef(InputPath); // return error if not found. if (!ModuleMapEntryOrErr) { llvm::errs() << "error: File \"" << InputPath << "\" not found.\n"; - return ModuleMapEntryOrErr.getError(); + return errorToErrorCode(ModuleMapEntryOrErr.takeError()); } - const FileEntry *ModuleMapEntry = *ModuleMapEntryOrErr; + FileEntryRef ModuleMapEntry = *ModuleMapEntryOrErr; // Because the module map parser uses a ForwardingDiagnosticConsumer, // which doesn't forward the BeginSourceFile call, we do it explicitly here. DC.BeginSourceFile(*LangOpts, nullptr); // Figure out the home directory for the module map file. 
- const DirectoryEntry *Dir = ModuleMapEntry->getDir(); - StringRef DirName(Dir->getName()); + DirectoryEntryRef Dir = ModuleMapEntry.getDir(); + StringRef DirName(Dir.getName()); if (llvm::sys::path::filename(DirName) == "Modules") { DirName = llvm::sys::path::parent_path(DirName); if (DirName.endswith(".framework")) { - if (auto DirEntry = FileMgr->getDirectory(DirName)) - Dir = *DirEntry; - else - Dir = nullptr; + auto FrameworkDirOrErr = FileMgr->getDirectoryRef(DirName); + if (!FrameworkDirOrErr) { + // This can happen if there's a race between the above check and the + // removal of the directory. + return errorToErrorCode(FrameworkDirOrErr.takeError()); + } + Dir = *FrameworkDirOrErr; } - // FIXME: This assert can fail if there's a race between the above check - // and the removal of the directory. - assert(Dir && "parent must exist"); } std::unique_ptr ModMap; diff --git a/clang/include/clang/Lex/ModuleMap.h b/clang/include/clang/Lex/ModuleMap.h index d291afa70619e..79cf0afc46271 100644 --- a/clang/include/clang/Lex/ModuleMap.h +++ b/clang/include/clang/Lex/ModuleMap.h @@ -729,8 +729,8 @@ class ModuleMap { /// /// \returns true if an error occurred, false otherwise. bool parseModuleMapFile(const FileEntry *File, bool IsSystem, - const DirectoryEntry *HomeDir, - FileID ID = FileID(), unsigned *Offset = nullptr, + DirectoryEntryRef HomeDir, FileID ID = FileID(), + unsigned *Offset = nullptr, SourceLocation ExternModuleLoc = SourceLocation()); /// Dump the contents of the module map, for debugging purposes. diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index efe2df0323ecc..833287c665167 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -1518,7 +1518,7 @@ namespace clang { /// The directory that file names in this module map file should /// be resolved relative to. - const DirectoryEntry *Directory; + DirectoryEntryRef Directory; /// Whether this module map is in a system header directory. bool IsSystem; @@ -1584,7 +1584,7 @@ namespace clang { explicit ModuleMapParser(Lexer &L, SourceManager &SourceMgr, const TargetInfo *Target, DiagnosticsEngine &Diags, ModuleMap &Map, const FileEntry *ModuleMapFile, - const DirectoryEntry *Directory, bool IsSystem) + DirectoryEntryRef Directory, bool IsSystem) : L(L), SourceMgr(SourceMgr), Target(Target), Diags(Diags), Map(Map), ModuleMapFile(ModuleMapFile), Directory(Directory), IsSystem(IsSystem) { @@ -2254,16 +2254,16 @@ void ModuleMapParser::parseExternModuleDecl() { StringRef FileNameRef = FileName; SmallString<128> ModuleMapFileName; if (llvm::sys::path::is_relative(FileNameRef)) { - ModuleMapFileName += Directory->getName(); + ModuleMapFileName += Directory.getName(); llvm::sys::path::append(ModuleMapFileName, FileName); FileNameRef = ModuleMapFileName; } - if (auto File = SourceMgr.getFileManager().getFile(FileNameRef)) + if (auto File = SourceMgr.getFileManager().getOptionalFileRef(FileNameRef)) Map.parseModuleMapFile( *File, IsSystem, Map.HeaderInfo.getHeaderSearchOpts().ModuleMapFileHomeIsCwd ? 
Directory - : (*File)->getDir(), + : File->getDir(), FileID(), nullptr, ExternLoc); } @@ -2518,7 +2518,7 @@ void ModuleMapParser::parseUmbrellaDirDecl(SourceLocation UmbrellaLoc) { Dir = SourceMgr.getFileManager().getOptionalDirectoryRef(DirName); } else { SmallString<128> PathName; - PathName = Directory->getName(); + PathName = Directory.getName(); llvm::sys::path::append(PathName, DirName); Dir = SourceMgr.getFileManager().getOptionalDirectoryRef(PathName); } @@ -3080,7 +3080,7 @@ bool ModuleMapParser::parseModuleMapFile() { } bool ModuleMap::parseModuleMapFile(const FileEntry *File, bool IsSystem, - const DirectoryEntry *Dir, FileID ID, + DirectoryEntryRef Dir, FileID ID, unsigned *Offset, SourceLocation ExternModuleLoc) { assert(Target && "Missing target information"); From 32d6f3cfd90b90321e84477b2bd353afbf9876ea Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 25 May 2023 14:37:20 -0700 Subject: [PATCH 143/704] [clang] NFCI: Use DirectoryEntryRef in Module::Directory This patch changes the type of `Module::Directory` from `const DirectoryEntry *` to (essentially) `Optional` in order to remove uses of the deprecated `DirectoryEntry::getName()`. Depends on D127651. Reviewed By: bnbarham Differential Revision: https://reviews.llvm.org/D127654 --- clang/include/clang/Basic/Module.h | 2 +- clang/lib/Lex/ModuleMap.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h index 128e2adc189b2..3ecab422bc42c 100644 --- a/clang/include/clang/Basic/Module.h +++ b/clang/include/clang/Basic/Module.h @@ -149,7 +149,7 @@ class alignas(8) Module { /// The build directory of this module. This is the directory in /// which the module is notionally built, and relative to which its headers /// are found. - const DirectoryEntry *Directory = nullptr; + OptionalDirectoryEntryRefDegradesToDirectoryEntryPtr Directory; /// The presumed file name for the module map defining this module. /// Only non-empty when building from preprocessed source. diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 833287c665167..6808fdfdaf4f9 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -181,7 +181,7 @@ OptionalFileEntryRef ModuleMap::findHeader( Module *M, const Module::UnresolvedHeaderDirective &Header, SmallVectorImpl &RelativePathName, bool &NeedsFramework) { // Search for the header file within the module's home directory. - auto *Directory = M->Directory; + auto Directory = M->Directory; SmallString<128> FullPathName(Directory->getName()); auto GetFile = [&](StringRef Filename) -> OptionalFileEntryRef { From b1e5b28b8c4d8af294906ee27a62b28079d3f337 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 25 May 2023 14:45:31 -0700 Subject: [PATCH 144/704] [clang][lex] NFCI: Use FileEntryRef in Sema::CodeCompleteIncludedFile This patch replaces some calls to the deprecated `DirectoryEntry::getName()` with calls to `DirectoryEntryRef::getName()` in SemaCodeComplete.cpp. Depends on D127654. 
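For illustration only (not code from this patch; the helper below is hypothetical and assumes `OptionalDirectoryEntryRef` is available from clang/Basic/DirectoryEntry.h), the optional-ref members introduced in this series are used like any optional: test first, then dereference to reach the non-deprecated `getName()`:

```
#include "clang/Basic/DirectoryEntry.h"
#include "llvm/ADT/StringRef.h"

// Hypothetical helper: returns the directory name, or an empty string when
// the optional ref is unset.
static llvm::StringRef dirNameOrEmpty(clang::OptionalDirectoryEntryRef Dir) {
  return Dir ? Dir->getName() : llvm::StringRef();
}
```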
Reviewed By: bnbarham Differential Revision: https://reviews.llvm.org/D127658 --- clang/lib/Sema/SemaCodeComplete.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 22d8b5eb44e8d..8c166570265c1 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -10055,12 +10055,12 @@ void Sema::CodeCompleteIncludedFile(llvm::StringRef Dir, bool Angled) { // header maps are not (currently) enumerable. break; case DirectoryLookup::LT_NormalDir: - AddFilesFromIncludeDir(IncludeDir.getDir()->getName(), IsSystem, + AddFilesFromIncludeDir(IncludeDir.getDirRef()->getName(), IsSystem, DirectoryLookup::LT_NormalDir); break; case DirectoryLookup::LT_Framework: - AddFilesFromIncludeDir(IncludeDir.getFrameworkDir()->getName(), IsSystem, - DirectoryLookup::LT_Framework); + AddFilesFromIncludeDir(IncludeDir.getFrameworkDirRef()->getName(), + IsSystem, DirectoryLookup::LT_Framework); break; } }; @@ -10072,9 +10072,8 @@ void Sema::CodeCompleteIncludedFile(llvm::StringRef Dir, bool Angled) { using llvm::make_range; if (!Angled) { // The current directory is on the include path for "quoted" includes. - const FileEntry *CurFile = PP.getCurrentFileLexer()->getFileEntry(); - if (CurFile && CurFile->getDir()) - AddFilesFromIncludeDir(CurFile->getDir()->getName(), false, + if (auto CurFile = PP.getCurrentFileLexer()->getFileEntry()) + AddFilesFromIncludeDir(CurFile->getDir().getName(), false, DirectoryLookup::LT_NormalDir); for (const auto &D : make_range(S.quoted_dir_begin(), S.quoted_dir_end())) AddFilesFromDirLookup(D, false); From e19f352beee72d8fc30d9a5366eadd2372719fa3 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Thu, 25 May 2023 14:47:57 -0700 Subject: [PATCH 145/704] [clang][lex] NFCI: Use DirectoryEntryRef in Preprocessor::MainFileDir This patch changes the type of `Preprocessor::MainFileDir` from `const DirectoryEntry *` to `Optional` in order to remove some calls to the deprecated `DirectoryEntry::getName()`. Depends on D127658. Reviewed By: bnbarham Differential Revision: https://reviews.llvm.org/D127660 --- clang/include/clang/Lex/Preprocessor.h | 6 ++---- clang/lib/Frontend/FrontendAction.cpp | 2 +- clang/lib/Lex/PPDirectives.cpp | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 8bdaf25e9b870..8fbc002059a86 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -625,7 +625,7 @@ class Preprocessor { /// The directory that the main file should be considered to occupy, /// if it does not correspond to a real file (as happens when building a /// module). - const DirectoryEntry *MainFileDir = nullptr; + OptionalDirectoryEntryRef MainFileDir; /// The number of bytes that we will initially skip when entering the /// main file, along with a flag that indicates whether skipping this number @@ -2012,9 +2012,7 @@ class Preprocessor { /// Set the directory in which the main file should be considered /// to have been found, if it is not a real file. - void setMainFileDir(const DirectoryEntry *Dir) { - MainFileDir = Dir; - } + void setMainFileDir(DirectoryEntryRef Dir) { MainFileDir = Dir; } /// Instruct the preprocessor to skip part of the main source file. 
/// diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index 77d03c4062223..a8dcdb44b08df 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -511,7 +511,7 @@ static Module *prepareToBuildModule(CompilerInstance &CI, // Inform the preprocessor that includes from within the input buffer should // be resolved relative to the build directory of the module map file. - CI.getPreprocessor().setMainFileDir(M->Directory); + CI.getPreprocessor().setMainFileDir(*M->Directory); // If the module was inferred from a different module map (via an expanded // umbrella module definition), track that fact. diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 2066c61748efa..1a5398e3adea6 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -956,7 +956,7 @@ OptionalFileEntryRef Preprocessor::LookupFile( // map file. if (!FileEnt) { if (FID == SourceMgr.getMainFileID() && MainFileDir) { - Includers.push_back(std::make_pair(nullptr, MainFileDir)); + Includers.push_back(std::make_pair(nullptr, *MainFileDir)); BuildSystemModule = getCurrentModule()->IsSystem; } else if ((FileEnt = SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()))) From e348dbc4b2766f17c251b6c305a3b34fbdb9be96 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Tue, 30 May 2023 14:08:04 -0700 Subject: [PATCH 146/704] [lldb] Fix build after Clang API change This fixes breakage introduced by 769d282d. --- .../Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp index 98c1b1a73b782..7895fc6d59ef7 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp @@ -333,7 +333,7 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, HS.getFileMgr().getDirectory(module.search_path.GetStringRef()); if (!dir) return error(); - auto *file = HS.lookupModuleMapFile(*dir, is_framework); + auto file = HS.lookupModuleMapFile(*dir, is_framework); if (!file) return error(); if (!HS.loadModuleMapFile(file, is_system)) From 43bec3376c07c9940361adef29bb2e4fb1f526a7 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 30 May 2023 14:12:20 -0700 Subject: [PATCH 147/704] Remove HAVE_STRERROR Most systems support strerror_r. For the remaining systems (e.g. MSVC) strerror_s and strerror can be used as fallbacks. We don't have a supported operating system/compiler that doesn't provide `strerror`. Close https://github.com/llvm/llvm-project/issues/62804 https://github.com/flang-compiler/f18/pull/1068 added a fallback when strerror is unavailable, but I think the code path is dead. 
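A minimal sketch of the selection logic that remains after this change (simplified from the updated llvm/lib/Support/Errno.cpp; it glosses over the GNU vs. XSI `strerror_r` return-type difference and the error handling the real code performs):

```
#include <cstring>
#include <string>

std::string strError(int errnum) {
#if defined(HAVE_STRERROR_R)
  char buffer[256] = {};
  strerror_r(errnum, buffer, sizeof(buffer)); // thread-safe variant, preferred
  return buffer;
#elif defined(HAVE_DECL_STRERROR_S)           // "Windows Secure API"
  char buffer[256] = {};
  strerror_s(buffer, sizeof(buffer) - 1, errnum);
  return buffer;
#else
  return std::strerror(errnum);               // plain strerror is assumed to always exist
#endif
}
```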
Reviewed By: serge-sans-paille, vzakhari Differential Revision: https://reviews.llvm.org/D151718 --- flang/runtime/CMakeLists.txt | 5 ----- flang/runtime/io-error.cpp | 5 +---- llvm/cmake/config-ix.cmake | 1 - llvm/include/llvm/Config/config.h.cmake | 3 --- llvm/lib/Support/Errno.cpp | 8 +------- llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 1 - .../llvm/include/llvm/Config/config.h | 3 --- utils/bazel/llvm_configs/config.h.cmake | 3 --- 8 files changed, 2 insertions(+), 27 deletions(-) diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt index 85845c02b3054..e69a0119b8351 100644 --- a/flang/runtime/CMakeLists.txt +++ b/flang/runtime/CMakeLists.txt @@ -50,7 +50,6 @@ endif() include(CheckCXXSymbolExists) include(CheckCXXSourceCompiles) -check_cxx_symbol_exists(strerror string.h HAVE_STRERROR) check_cxx_symbol_exists(strerror_r string.h HAVE_STRERROR_R) # Can't use symbol exists here as the function is overloaded in C++ check_cxx_source_compiles( @@ -69,10 +68,6 @@ else() set(NO_LTO_FLAGS "") endif() -if (NOT (HAVE_STRERROR OR HAVE_STRERROR_R OR HAVE_DECL_STRERROR_S)) - message(FATAL_ERROR "None of strerror, strerror_r, strerror_s found.") -endif() - configure_file(config.h.cmake config.h) # include_directories is used here instead of target_include_directories # because add_flang_library creates multiple objects (STATIC/SHARED, OBJECT) diff --git a/flang/runtime/io-error.cpp b/flang/runtime/io-error.cpp index 56e4b24cbe062..c8f6675c60a6c 100644 --- a/flang/runtime/io-error.cpp +++ b/flang/runtime/io-error.cpp @@ -122,14 +122,11 @@ bool IoErrorHandler::GetIoMsg(char *buffer, std::size_t bufferLength) { #endif #elif HAVE_DECL_STRERROR_S // "Windows Secure API" ok = ::strerror_s(buffer, bufferLength, ioStat_) == 0; -#elif HAVE_STRERROR +#else // Copy the thread un-safe result of strerror into // the buffer as fast as possible to minimize impact // of collision of strerror in multiple threads. msg = strerror(ioStat_); -#else - // Strange that this system doesn't even have strerror - return false; #endif if (msg) { ToFortranDefaultCharacter(buffer, bufferLength, msg); diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 2c815430432bf..1c11056593e9a 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -299,7 +299,6 @@ check_symbol_exists(getrlimit "sys/types.h;sys/time.h;sys/resource.h" HAVE_GETRL check_symbol_exists(posix_spawn spawn.h HAVE_POSIX_SPAWN) check_symbol_exists(pread unistd.h HAVE_PREAD) check_symbol_exists(sbrk unistd.h HAVE_SBRK) -check_symbol_exists(strerror string.h HAVE_STRERROR) check_symbol_exists(strerror_r string.h HAVE_STRERROR_R) check_symbol_exists(strerror_s string.h HAVE_DECL_STRERROR_S) check_symbol_exists(setenv stdlib.h HAVE_SETENV) diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index 29ac536b4c31b..216919078c7aa 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -176,9 +176,6 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SIGNAL_H ${HAVE_SIGNAL_H} -/* Define to 1 if you have the `strerror' function. */ -#cmakedefine HAVE_STRERROR ${HAVE_STRERROR} - /* Define to 1 if you have the `strerror_r' function. 
*/ #cmakedefine HAVE_STRERROR_R ${HAVE_STRERROR_R} diff --git a/llvm/lib/Support/Errno.cpp b/llvm/lib/Support/Errno.cpp index 7f665be8db6c1..60a7e536b6c5c 100644 --- a/llvm/lib/Support/Errno.cpp +++ b/llvm/lib/Support/Errno.cpp @@ -55,17 +55,11 @@ std::string StrError(int errnum) { #elif HAVE_DECL_STRERROR_S // "Windows Secure API" strerror_s(buffer, MaxErrStrLen - 1, errnum); str = buffer; -#elif defined(HAVE_STRERROR) +#else // Copy the thread un-safe result of strerror into // the buffer as fast as possible to minimize impact // of collision of strerror in multiple threads. str = strerror(errnum); -#else - // Strange that this system doesn't even have strerror - // but, oh well, just use a generic message - raw_string_ostream stream(str); - stream << "Error #" << errnum; - stream.flush(); #endif return str; } diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 07fdcfa908677..bda9bdb4edbdf 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -99,7 +99,6 @@ write_cmake_config("config") { "HAVE_LIBPSAPI=", "HAVE_MALLCTL=", "HAVE_SIGNAL_H=1", - "HAVE_STRERROR=1", "HAVE_SYS_STAT_H=1", "HAVE_SYS_TYPES_H=1", "HAVE_VALGRIND_VALGRIND_H=", diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index 6a68ac040bb87..b4fb2373d571f 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -189,9 +189,6 @@ /* Define to 1 if you have the header file. */ #define HAVE_SIGNAL_H 1 -/* Define to 1 if you have the `strerror' function. */ -#define HAVE_STRERROR 1 - /* Define to 1 if you have the `strerror_r' function. */ /* HAVE_STRERROR_R defined in Bazel */ diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake index 29ac536b4c31b..216919078c7aa 100644 --- a/utils/bazel/llvm_configs/config.h.cmake +++ b/utils/bazel/llvm_configs/config.h.cmake @@ -176,9 +176,6 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SIGNAL_H ${HAVE_SIGNAL_H} -/* Define to 1 if you have the `strerror' function. */ -#cmakedefine HAVE_STRERROR ${HAVE_STRERROR} - /* Define to 1 if you have the `strerror_r' function. 
*/ #cmakedefine HAVE_STRERROR_R ${HAVE_STRERROR_R} From 087b67cc067710aeb2d660aed4b686df8754d418 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Tue, 30 May 2023 21:13:49 +0000 Subject: [PATCH 148/704] [AMDGPU][LoadStoreVectorizer] Pre-commit test for addrspace 7 crash Differential Revision: https://reviews.llvm.org/D151751 --- .../LoadStoreVectorizer/AMDGPU/addrspace-7.ll | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/addrspace-7.ll diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/addrspace-7.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/addrspace-7.ll new file mode 100644 index 0000000000000..8e4fd57534562 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/addrspace-7.ll @@ -0,0 +1,14 @@ +; REQUIRES: asserts +; RUN: not --crash opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s +; RUN: not --crash opt -mtriple=amdgcn-amd-amdhsa -aa-pipeline=basic-aa -passes='function(load-store-vectorizer)' -S -o - %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +define { float, float } @f() { +bb: + %l1 = load float, ptr addrspace(7) null + %l2 = load float, ptr addrspace(7) getelementptr (i8, ptr addrspace(7) null, i64 24) + %iv1 = insertvalue { float, float } zeroinitializer, float %l1, 0 + %iv2 = insertvalue { float, float } %iv1, float %l2, 1 + ret { float, float } %iv2 +} From a79b0f9f1d8275b023bcd2bf1763b148d088ad97 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Tue, 30 May 2023 14:19:40 -0700 Subject: [PATCH 149/704] [lldb] Fix build after Clang API change at rev 769d282d7292 --- .../Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp index 7895fc6d59ef7..0af5de4702df6 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp @@ -336,7 +336,7 @@ bool ClangModulesDeclVendorImpl::AddModule(const SourceModule &module, auto file = HS.lookupModuleMapFile(*dir, is_framework); if (!file) return error(); - if (!HS.loadModuleMapFile(file, is_system)) + if (!HS.loadModuleMapFile(*file, is_system)) return error(); } } From e3fbede7f3fd7693d5a15a8cfa0b62d9a4f84877 Mon Sep 17 00:00:00 2001 From: Luke Drummond Date: Tue, 30 May 2023 10:37:47 +0100 Subject: [PATCH 150/704] [HIP] Add missing __hip_atomic_fetch_sub support The rest of the fetch/op intrinsics were added in e13246a2ec3 but sub was conspicuous by its absence. 
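Usage follows the same shape as the existing fetch/op builtins; as an illustration (the device function below is a made-up example, with the ordering and scope picked arbitrarily from the values exercised by the new tests), the builtin returns the value the object held before the subtraction:

```
// Hypothetical HIP device helper, assuming the usual HIP runtime header is
// already included.
__device__ int decrement_counter(int *counter, int amount) {
  // Atomically performs *counter -= amount and returns the old value,
  // with relaxed ordering at agent (device) scope.
  return __hip_atomic_fetch_sub(counter, amount, __ATOMIC_RELAXED,
                                __HIP_MEMORY_SCOPE_AGENT);
}
```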
Reviewed By: yaxunl Differential Revision: https://reviews.llvm.org/D151701 --- clang/include/clang/Basic/Builtins.def | 1 + clang/lib/AST/Expr.cpp | 1 + clang/lib/CodeGen/CGAtomic.cpp | 4 ++++ clang/lib/Sema/SemaChecking.cpp | 1 + clang/test/CodeGenCUDA/atomic-ops.cu | 20 ++++++++++++++++++++ 5 files changed, 27 insertions(+) diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 15c69c2786476..e8cd200257c2a 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -910,6 +910,7 @@ ATOMIC_BUILTIN(__hip_atomic_compare_exchange_weak, "v.", "t") ATOMIC_BUILTIN(__hip_atomic_compare_exchange_strong, "v.", "t") ATOMIC_BUILTIN(__hip_atomic_exchange, "v.", "t") ATOMIC_BUILTIN(__hip_atomic_fetch_add, "v.", "t") +ATOMIC_BUILTIN(__hip_atomic_fetch_sub, "v.", "t") ATOMIC_BUILTIN(__hip_atomic_fetch_and, "v.", "t") ATOMIC_BUILTIN(__hip_atomic_fetch_or, "v.", "t") ATOMIC_BUILTIN(__hip_atomic_fetch_xor, "v.", "t") diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 958f4e9042319..c3c00932cee4e 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -4857,6 +4857,7 @@ unsigned AtomicExpr::getNumSubExprs(AtomicOp Op) { case AO__hip_atomic_exchange: case AO__hip_atomic_fetch_add: + case AO__hip_atomic_fetch_sub: case AO__hip_atomic_fetch_and: case AO__hip_atomic_fetch_or: case AO__hip_atomic_fetch_xor: diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index 8ef95bb808468..0e7eb9723b49e 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -623,6 +623,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, : llvm::Instruction::Sub; [[fallthrough]]; case AtomicExpr::AO__c11_atomic_fetch_sub: + case AtomicExpr::AO__hip_atomic_fetch_sub: case AtomicExpr::AO__opencl_atomic_fetch_sub: case AtomicExpr::AO__atomic_fetch_sub: Op = E->getValueType()->isFloatingType() ? 
llvm::AtomicRMWInst::FSub @@ -897,6 +898,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { case AtomicExpr::AO__c11_atomic_fetch_add: case AtomicExpr::AO__c11_atomic_fetch_sub: case AtomicExpr::AO__hip_atomic_fetch_add: + case AtomicExpr::AO__hip_atomic_fetch_sub: case AtomicExpr::AO__opencl_atomic_fetch_add: case AtomicExpr::AO__opencl_atomic_fetch_sub: if (MemTy->isPointerType()) { @@ -1013,6 +1015,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { case AtomicExpr::AO__c11_atomic_fetch_sub: case AtomicExpr::AO__opencl_atomic_fetch_sub: case AtomicExpr::AO__atomic_fetch_sub: + case AtomicExpr::AO__hip_atomic_fetch_sub: case AtomicExpr::AO__c11_atomic_fetch_xor: case AtomicExpr::AO__opencl_atomic_fetch_xor: case AtomicExpr::AO__opencl_atomic_fetch_min: @@ -1218,6 +1221,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { [[fallthrough]]; case AtomicExpr::AO__c11_atomic_fetch_sub: case AtomicExpr::AO__opencl_atomic_fetch_sub: + case AtomicExpr::AO__hip_atomic_fetch_sub: case AtomicExpr::AO__atomic_fetch_sub: LibCallName = "__atomic_fetch_sub"; AddDirectArgument(*this, Args, UseOptimizedLibcall, Val1.getPointer(), diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index d94e1d0beeaef..c8ebd51a4b3ef 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -6438,6 +6438,7 @@ ExprResult Sema::BuildAtomicExpr(SourceRange CallRange, SourceRange ExprRange, Form = Copy; break; case AtomicExpr::AO__hip_atomic_fetch_add: + case AtomicExpr::AO__hip_atomic_fetch_sub: case AtomicExpr::AO__hip_atomic_fetch_min: case AtomicExpr::AO__hip_atomic_fetch_max: case AtomicExpr::AO__c11_atomic_fetch_add: diff --git a/clang/test/CodeGenCUDA/atomic-ops.cu b/clang/test/CodeGenCUDA/atomic-ops.cu index 13f4a015386cb..fbc042caa809f 100644 --- a/clang/test/CodeGenCUDA/atomic-ops.cu +++ b/clang/test/CodeGenCUDA/atomic-ops.cu @@ -6,6 +6,7 @@ // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 4 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") // CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") @@ -18,6 +19,7 @@ __device__ int atomic32_op_singlethread(int *ptr, int val, int desired) { flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -42,6 +44,7 @@ __device__ unsigned int atomicu32_op_singlethread(unsigned int *ptr, unsigned in // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} 
syncscope("wavefront-one-as") monotonic monotonic, align 4 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") // CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") @@ -54,6 +57,7 @@ __device__ int atomic32_op_wavefront(int *ptr, int val, int desired) { flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -78,6 +82,7 @@ __device__ unsigned int atomicu32_op_wavefront(unsigned int *ptr, unsigned int v // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 4 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") // CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") @@ -89,6 +94,7 @@ __device__ int atomic32_op_workgroup(int *ptr, int val, int desired) { flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -112,6 +118,7 @@ __device__ unsigned int atomicu32_op_workgroup(unsigned int *ptr, unsigned int v // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 4 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") // CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} 
syncscope("agent-one-as") @@ -123,6 +130,7 @@ __device__ int atomic32_op_agent(int *ptr, int val, int desired) { flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -146,6 +154,7 @@ __device__ unsigned int atomicu32_op_agent(unsigned int *ptr, unsigned int val, // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 4 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") // CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") @@ -158,6 +167,7 @@ __device__ int atomic32_op_system(int *ptr, int val, int desired) { flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); @@ -182,6 +192,7 @@ __device__ unsigned int atomicu32_op_system(unsigned int *ptr, unsigned int val, // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 8 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") // CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") @@ -193,6 +204,7 @@ __device__ long long atomic64_op_singlethread(long long *ptr, long long *ptr2, l flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = 
__hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -220,6 +232,7 @@ __device__ unsigned long long atomicu64_op_singlethread(unsigned long long *ptr, // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 8 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") // CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") @@ -232,6 +245,7 @@ __device__ long long atomic64_op_wavefront(long long *ptr, long long *ptr2, long flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -260,6 +274,7 @@ __device__ unsigned long long atomicu64_op_wavefront(unsigned long long *ptr, un // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 8 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") // CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") @@ -271,6 +286,7 @@ __device__ long long atomic64_op_workgroup(long long *ptr, long long *ptr2, long flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -296,6 +312,7 @@ __device__ unsigned long long atomicu64_op_workgroup(unsigned long long *ptr, un // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 8 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") // CHECK: atomicrmw add ptr 
{{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") @@ -307,6 +324,7 @@ __device__ long long atomic64_op_agent(long long *ptr, long long *ptr2, long lon flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -332,6 +350,7 @@ __device__ unsigned long long atomicu64_op_agent(unsigned long long *ptr, unsign // CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 8 // CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") // CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") // CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") // CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") // CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") @@ -344,6 +363,7 @@ __device__ long long atomic64_op_system(long long *ptr, long long *ptr2, long lo flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_exchange(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_add(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); + val = __hip_atomic_fetch_sub(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_and(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_or(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_xor(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); From 6b4b63a832f105039442fc983d0b309abe5261d5 Mon Sep 17 00:00:00 2001 From: Rafael Ubal Tena Date: Tue, 30 May 2023 10:43:24 -0700 Subject: [PATCH 151/704] Lowering for 'tosa.scatter' This patch adds support for `tosa.scatter` lowering in the `--tosa-to-scf` pass. 
Here's an example for this lowering:

```
func.func @tosa(
    %valuesIn : tensor<3x7x5xi32>,
    %indices : tensor<3x6xi32>,
    %input : tensor<3x6x5xi32>) -> tensor<3x7x5xi32> {
  %0 = "tosa.scatter"(%valuesIn, %indices, %input) : (tensor<3x7x5xi32>, tensor<3x6xi32>, tensor<3x6x5xi32>) -> (tensor<3x7x5xi32>)
  return %0 : tensor<3x7x5xi32>
}
```

translates to

```
func.func @tosa(%arg0: tensor<3x7x5xi32>, %arg1: tensor<3x6xi32>, %arg2: tensor<3x6x5xi32>) -> tensor<3x7x5xi32> {
  %c0 = arith.constant 0 : index
  %c3 = arith.constant 3 : index
  %c1 = arith.constant 1 : index
  %c6 = arith.constant 6 : index
  %c2 = arith.constant 2 : index
  %c5 = arith.constant 5 : index
  %c0_0 = arith.constant 0 : index
  %c1_1 = arith.constant 1 : index
  %0 = scf.for %arg3 = %c0_0 to %c3 step %c1_1 iter_args(%arg4 = %arg0) -> (tensor<3x7x5xi32>) {
    %1 = scf.for %arg5 = %c0_0 to %c6 step %c1_1 iter_args(%arg6 = %arg4) -> (tensor<3x7x5xi32>) {
      %extracted = tensor.extract %arg1[%arg3, %arg5] : tensor<3x6xi32>
      %2 = arith.index_cast %extracted : i32 to index
      %extracted_slice = tensor.extract_slice %arg2[%arg3, %arg5, %c0_0] [%c1_1, %c1_1, %c5] [%c1_1, %c1_1, %c1_1] : tensor<3x6x5xi32> to tensor
      %inserted_slice = tensor.insert_slice %extracted_slice into %arg6[%arg3, %2, %c0_0] [%c1_1, %c1_1, %c5] [%c1_1, %c1_1, %c1_1] : tensor into tensor<3x7x5xi32>
      scf.yield %inserted_slice : tensor<3x7x5xi32>
    }
    scf.yield %1 : tensor<3x7x5xi32>
  }
  return %0 : tensor<3x7x5xi32>
}
```

We have attempted an alternative lowering pass that uses `tensor.scatter` as an
intermediate step. However, we opted to aim straight at the `scf` dialect for
the following reasons:

- The `tensor.scatter` op doesn't seem to be used anywhere. There is no
  available lowering pass for this op (although we have one that we'll
  upstream soon).

- The `tosa.scatter` and `tensor.scatter` op have different indexing
  semantics. The `indices` argument of `tosa.scatter` must be non-trivially
  modified and restructured (e.g. with a `linalg.generic` op) to adapt to the
  needs of `tensor.scatter`. While this overhead may be simplified and fused
  after a subsequent `tensor.scatter` lowering, it adds complex logic and an
  obscure intermediate state. Unless there is a good reason to go through the
  `tensor` dialect that we're missing, this additional complexity may not be
  justified.
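To make the indexing semantics above concrete, here is a hedged C++ sketch (illustration only, not part of the change; the element type is fixed to i32, and out-of-range or duplicate indices are assumed absent) of what `tosa.scatter` computes, which is essentially what the generated loop nest reproduces with the C dimension handled as one slice per iteration:

```
// Informal reference semantics for tosa.scatter. Shapes follow the TOSA
// convention used in this patch: values_in/values_out are [N][K][C],
// indices is [N][W], input is [N][W][C].
template <int N, int K, int W, int C>
void referenceScatter(const int (&valuesIn)[N][K][C],
                      const int (&indices)[N][W],
                      const int (&input)[N][W][C],
                      int (&valuesOut)[N][K][C]) {
  // Start from values_in; rows selected by `indices` are then overwritten.
  for (int n = 0; n < N; ++n)
    for (int k = 0; k < K; ++k)
      for (int c = 0; c < C; ++c)
        valuesOut[n][k][c] = valuesIn[n][k][c];
  for (int n = 0; n < N; ++n)
    for (int w = 0; w < W; ++w)
      for (int c = 0; c < C; ++c)
        valuesOut[n][indices[n][w]][c] = input[n][w][c];
}
```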
Reviewed By: eric-k256 Differential Revision: https://reviews.llvm.org/D151117 --- mlir/lib/Conversion/TosaToSCF/TosaToSCF.cpp | 73 ++++++++++++++++++- .../Conversion/TosaToSCF/TosaToSCFPass.cpp | 2 +- .../Conversion/TosaToSCF/tosa-to-scf.mlir | 30 ++++++++ 3 files changed, 102 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Conversion/TosaToSCF/TosaToSCF.cpp b/mlir/lib/Conversion/TosaToSCF/TosaToSCF.cpp index 8f10497d99c32..9139bf191fdf1 100644 --- a/mlir/lib/Conversion/TosaToSCF/TosaToSCF.cpp +++ b/mlir/lib/Conversion/TosaToSCF/TosaToSCF.cpp @@ -82,6 +82,75 @@ class IfOpConverter : public OpRewritePattern { } }; +class ScatterOpConverter : public OpRewritePattern { + static Value createTensorDim(OpBuilder &builder, Location loc, Value tensor, + int64_t dim) { + return builder.createOrFold(loc, tensor, dim); + } + + static Value createIndexConst(OpBuilder &builder, Location loc, + int64_t value) { + return builder.create(loc, value); + } + +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::ScatterOp scatter, + PatternRewriter &rewriter) const final { + auto valuesIn = scatter.getValuesIn(); + auto indices = scatter.getIndices(); + auto input = scatter.getInput(); + auto loc = scatter.getLoc(); + + // N, W, C are chosen to match the TOSA spec + auto dimN = createTensorDim(rewriter, loc, input, 0); + auto dimW = createTensorDim(rewriter, loc, input, 1); + auto dimC = createTensorDim(rewriter, loc, input, 2); + + auto zero = createIndexConst(rewriter, loc, 0); + auto one = createIndexConst(rewriter, loc, 1); + + // Loop bounds + auto lbs = llvm::SmallVector(2, zero); + auto steps = llvm::SmallVector(2, one); + auto ubs = llvm::SmallVector{{dimN, dimW}}; + + auto buildBody = [&](OpBuilder &builder, Location loc, ValueRange ivs, + ValueRange args) -> scf::ValueVector { + auto n = ivs[0]; + + // Read the index and cast it to index type + auto index = builder.create(loc, indices, ivs); + auto castIndex = builder.create( + loc, builder.getIndexType(), index); + + // Offset, sizes, and strides for the input tensor + auto inputOffset = llvm::to_vector(ivs); + inputOffset.push_back(zero); + + llvm::SmallVector sizes = {one, one, dimC}; + llvm::SmallVector strides = {one, one, one}; + + auto slice = builder.create( + loc, input, inputOffset, sizes, strides); + + // Insert the slice into the output accumulator tensor. 
+ llvm::SmallVector outputOffset = {n, castIndex, zero}; + auto updated = builder.create( + loc, slice, args[0], outputOffset, sizes, strides); + + return {updated}; + }; + + auto loops = scf::buildLoopNest(rewriter, loc, lbs, ubs, steps, + ValueRange{valuesIn}, buildBody); + rewriter.replaceOp(scatter, loops.results); + + return success(); + } +}; + class WhileOpConverter : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -106,6 +175,6 @@ class WhileOpConverter : public OpRewritePattern { void mlir::tosa::populateTosaToSCFConversionPatterns( RewritePatternSet *patterns) { - patterns->add(patterns->getContext()); - patterns->add(patterns->getContext()); + patterns->add( + patterns->getContext()); } diff --git a/mlir/lib/Conversion/TosaToSCF/TosaToSCFPass.cpp b/mlir/lib/Conversion/TosaToSCF/TosaToSCFPass.cpp index 759b730556d7a..d14535029132f 100644 --- a/mlir/lib/Conversion/TosaToSCF/TosaToSCFPass.cpp +++ b/mlir/lib/Conversion/TosaToSCF/TosaToSCFPass.cpp @@ -37,7 +37,7 @@ struct TosaToSCF : public impl::TosaToSCFBase { RewritePatternSet patterns(&getContext()); ConversionTarget target(getContext()); target.addLegalDialect(); - target.addIllegalOp(); + target.addIllegalOp(); target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); auto *op = getOperation(); diff --git a/mlir/test/Conversion/TosaToSCF/tosa-to-scf.mlir b/mlir/test/Conversion/TosaToSCF/tosa-to-scf.mlir index 59931137cdf5b..4f0e29539b6e4 100644 --- a/mlir/test/Conversion/TosaToSCF/tosa-to-scf.mlir +++ b/mlir/test/Conversion/TosaToSCF/tosa-to-scf.mlir @@ -56,3 +56,33 @@ func.func @if_test(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) return %0 : tensor } + +// ----- + +// CHECK-LABEL: func @scatter_test +// CHECK-SAME: ([[VALUES_IN:%.+]]: tensor<3x7x5xi32>, [[INDICES:%.+]]: tensor<3x6xi32>, [[INPUT:%.+]]: tensor<3x6x5xi32>) +func.func @scatter_test(%values_in: tensor<3x7x5xi32>, %indices : tensor<3x6xi32>, %input: tensor<3x6x5xi32>) -> tensor<3x7x5xi32> { + + // CHECK-DAG: [[C_0:%.+]] = arith.constant 0 : index + // CHECK-DAG: [[C_1:%.+]] = arith.constant 1 : index + // CHECK-DAG: [[C_2:%.+]] = arith.constant 2 : index + // CHECK-DAG: [[C_3:%.+]] = arith.constant 3 : index + // CHECK-DAG: [[C_5:%.+]] = arith.constant 5 : index + // CHECK-DAG: [[C_6:%.+]] = arith.constant 6 : index + // CHECK-DAG: [[C_0_0:%.+]] = arith.constant 0 : index + // CHECK-DAG: [[C_1_0:%.+]] = arith.constant 1 : index + // CHECK: [[RESULT_0:%.+]] = scf.for [[ITER_VAR_0:%.+]] = [[C_0_0]] to [[C_3]] step [[C_1_0]] iter_args([[ITER_ARG_0:%.+]] = [[VALUES_IN]]) -> (tensor<3x7x5xi32>) { + // CHECK: [[RESULT_1:%.+]] = scf.for [[ITER_VAR_1:%.+]] = [[C_0_0]] to [[C_6]] step [[C_1_0]] iter_args([[ITER_ARG_1:%.+]] = [[ITER_ARG_0]]) -> (tensor<3x7x5xi32>) { + // CHECK-DAG: [[EXTRACTED:%.+]] = tensor.extract [[INDICES]][[[ITER_VAR_0]], [[ITER_VAR_1]]] : tensor<3x6xi32> + // CHECK-DAG: [[EXTRACTED_CAST:%.+]] = arith.index_cast [[EXTRACTED]] : i32 to index + // CHECK-DAG: [[EXTRACTED_SLICE:%.+]] = tensor.extract_slice [[INPUT]][[[ITER_VAR_0]], [[ITER_VAR_1]], [[C_0_0]]] [[[C_1_0]], [[C_1_0]], [[C_5]]] [[[C_1_0]], [[C_1_0]], [[C_1_0]]] : tensor<3x6x5xi32> to tensor + // CHECK-DAG: [[INSERTED_SLICE:%.+]] = tensor.insert_slice [[EXTRACTED_SLICE]] into [[ITER_ARG_1]][[[ITER_VAR_0]], [[EXTRACTED_CAST]], [[C_0_0]]] [[[C_1_0]], [[C_1_0]], [[C_5]]] [[[C_1_0]], [[C_1_0]], [[C_1_0]]] : tensor into tensor<3x7x5xi32> + // CHECK: scf.yield [[INSERTED_SLICE]] : tensor<3x7x5xi32> + // CHECK: } + // CHECK: scf.yield [[RESULT_1]] : 
tensor<3x7x5xi32> + // CHECK: } + %0 = "tosa.scatter"(%values_in, %indices, %input) : (tensor<3x7x5xi32>, tensor<3x6xi32>, tensor<3x6x5xi32>) -> (tensor<3x7x5xi32>) + + // CHECK: return [[RESULT_0]] : tensor<3x7x5xi32> + return %0 : tensor<3x7x5xi32> +} From 463f50b436a2ac3000a90d273f2ed05893e8864f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 30 May 2023 14:38:16 -0700 Subject: [PATCH 152/704] [RISCV] Add RISCVISD::VFWMUL_VL. Use it to replace isel patterns with a DAG combine. This is more consistent with how we handle integer widening multiply. A follow up patch will add support for matching vfwmul when the multiplicand is being squared. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 35 +++++++++++++++++++ llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 ++ .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 23 +++++++++++- 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 3dc04d0f29e93..9d0267912c9fb 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -11355,6 +11355,38 @@ static SDValue performVFMADD_VLCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(2), Mask, VL); } +static SDValue performVFMUL_VLCombine(SDNode *N, SelectionDAG &DAG) { + // FIXME: Ignore strict opcodes for now. + assert(!N->isTargetStrictFPOpcode() && "Unexpected opcode"); + + // Try to form widening multiply. + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Merge = N->getOperand(2); + SDValue Mask = N->getOperand(3); + SDValue VL = N->getOperand(4); + + if (Op0.getOpcode() != RISCVISD::FP_EXTEND_VL || + Op1.getOpcode() != RISCVISD::FP_EXTEND_VL) + return SDValue(); + + // TODO: Refactor to handle more complex cases similar to + // combineBinOp_VLToVWBinOp_VL. + if (!Op0.hasOneUse() || !Op1.hasOneUse()) + return SDValue(); + + // Check the mask and VL are the same. + if (Op0.getOperand(1) != Mask || Op0.getOperand(2) != VL || + Op1.getOperand(1) != Mask || Op1.getOperand(2) != VL) + return SDValue(); + + Op0 = Op0.getOperand(0); + Op1 = Op1.getOperand(0); + + return DAG.getNode(RISCVISD::VFWMUL_VL, SDLoc(N), N->getValueType(0), Op0, + Op1, Merge, Mask, VL); +} + static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { assert(N->getOpcode() == ISD::SRA && "Unexpected opcode"); @@ -12229,6 +12261,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case RISCVISD::STRICT_VFMSUB_VL: case RISCVISD::STRICT_VFNMSUB_VL: return performVFMADD_VLCombine(N, DAG); + case RISCVISD::FMUL_VL: + return performVFMUL_VLCombine(N, DAG); case ISD::LOAD: case ISD::STORE: { if (DCI.isAfterLegalizeDAG()) @@ -15339,6 +15373,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VWADDU_W_VL) NODE_NAME_CASE(VWSUB_W_VL) NODE_NAME_CASE(VWSUBU_W_VL) + NODE_NAME_CASE(VFWMUL_VL) NODE_NAME_CASE(VNSRL_VL) NODE_NAME_CASE(SETCC_VL) NODE_NAME_CASE(VSELECT_VL) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 829ff1fd46929..af6849cf73e6f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -284,6 +284,8 @@ enum NodeType : unsigned { VWSUB_W_VL, VWSUBU_W_VL, + VFWMUL_VL, + // Narrowing logical shift right. 
// Operands are (source, shift, passthru, mask, vl) VNSRL_VL, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 76e2a2b4f56b1..b83ae5ff7cddb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -388,6 +388,8 @@ def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCo def riscv_vwsub_vl : SDNode<"RISCVISD::VWSUB_VL", SDT_RISCVVWBinOp_VL, []>; def riscv_vwsubu_vl : SDNode<"RISCVISD::VWSUBU_VL", SDT_RISCVVWBinOp_VL, []>; +def riscv_vfwmul_vl : SDNode<"RISCVISD::VFWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; + def SDT_RISCVVNBinOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisSameNumEltsAs<0, 1>, SDTCisOpSmallerThanOp<0, 1>, @@ -726,6 +728,7 @@ multiclass VPatBinaryWVL_VV_VX { } } } + multiclass VPatBinaryWVL_VV_VX_WV_WX : VPatBinaryWVL_VV_VX { @@ -1346,6 +1349,24 @@ multiclass VPatWidenReductionVL_Ext_VL { + foreach fvtiToFWti = AllWidenableFloatVectors in { + defvar vti = fvtiToFWti.Vti; + defvar wti = fvtiToFWti.Wti; + let Predicates = !listconcat(GetVTypePredicates.Predicates, + GetVTypePredicates.Predicates) in { + defm : VPatBinaryVL_V; + defm : VPatBinaryVL_VF; + } + } +} + multiclass VPatWidenBinaryFPVL_VV_VF { foreach fvtiToFWti = AllWidenableFloatVectors in { defvar fvti = fvtiToFWti.Vti; @@ -1918,7 +1939,7 @@ defm : VPatBinaryFPVL_VV_VF_E; defm : VPatBinaryFPVL_R_VF_E; // 13.5. Vector Widening Floating-Point Multiply Instructions -defm : VPatWidenBinaryFPVL_VV_VF; +defm : VPatBinaryFPWVL_VV_VF; // 13.6 Vector Single-Width Floating-Point Fused Multiply-Add Instructions. defm : VPatFPMulAddVL_VV_VF; From 1721e72d6e6d0c18ac36155b1f89fd81f45994db Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 27 Apr 2023 14:43:32 -0700 Subject: [PATCH 153/704] [AMDGPU][IGLP] Parameterize the SchedGroup processing / linking order in Solver Currently the PipelineSolver processes SchedGroups in bottom up manner. However, there is no compelling reason to require this. Providing the option to toggle this affords greater experimentation capability, and make usage a bit more intuitive. Importantly, it makes designing rules much easier. Differential Revision: https://reviews.llvm.org/D149393 Change-Id: Ic4abd3408f9faa105c0eef72eab7873d46083ee4 --- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 287 ++++++++++++------ .../CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 138 +++++++++ 2 files changed, 333 insertions(+), 92 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index fc0df61952e48..adbde8efb0bce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -254,6 +254,9 @@ class PipelineSolver { // How many branches we have explored uint64_t BranchesExplored = 0; + // The direction in which we process the candidate SchedGroups per SU + bool IsBottomUp = 1; + // Update indices to fit next conflicting instruction void advancePosition(); // Recede indices to attempt to find better fit for previous conflicting @@ -264,19 +267,35 @@ class PipelineSolver { bool solveExact(); // The polynomial time algorithm which attempts to find a good fit bool solveGreedy(); + // Find the best SchedGroup for the current SU using the heuristic given all + // current information. One step in the greedy algorithm. Templated against + // the SchedGroup iterator (either reverse or forward). 
+ template + void greedyFind(std::vector> &AddedEdges, T I, + T E); // Whether or not the current solution is optimal bool checkOptimal(); // Populate the ready list, prioiritizing fewest missed edges first - void populateReadyList(SUToCandSGsPair &CurrSU, - SmallVectorImpl> &ReadyList, - SmallVectorImpl &SyncPipeline); + // Templated against the SchedGroup iterator (either reverse or forward). + template + void populateReadyList(SmallVectorImpl> &ReadyList, T I, + T E); // Add edges corresponding to the SchedGroups as assigned by solver void makePipeline(); + // Link the SchedGroups in the best found pipeline. + // Tmplated against the SchedGroup iterator (either reverse or forward). + template void linkSchedGroups(T I, T E); // Add the edges from the SU to the other SchedGroups in pipeline, and // return the number of edges missed. int addEdges(SmallVectorImpl &SyncPipeline, SUnit *SU, int SGID, std::vector> &AddedEdges); - // Remove the edges passed via AddedEdges + // Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It + // returns the cost (in terms of missed pipeline edges), and tracks the edges + // added in \p AddedEdges + template + int linkSUnit(SUnit *SU, int SGID, + std::vector> &AddedEdges, T I, T E); + // Remove the edges passed via \p AddedEdges void removeEdges(const std::vector> &AddedEdges); // Convert the passed in maps to arrays for bidirectional iterators void convertSyncMapsToArrays(); @@ -290,9 +309,9 @@ class PipelineSolver { PipelineSolver(DenseMap> &SyncedSchedGroups, DenseMap &SyncedInstrs, - ScheduleDAGMI *DAG) + ScheduleDAGMI *DAG, bool IsBottomUp = 1) : DAG(DAG), SyncedInstrs(SyncedInstrs), - SyncedSchedGroups(SyncedSchedGroups) { + SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { for (auto &PipelineInstrs : SyncedInstrs) { if (PipelineInstrs.second.size() > 0) { @@ -363,14 +382,27 @@ void PipelineSolver::convertSyncMapsToArrays() { } } +template void PipelineSolver::linkSchedGroups(T I, T E) { + for (; I != E; ++I) { + auto &GroupA = *I; + for (auto J = std::next(I); J != E; ++J) { + auto &GroupB = *J; + GroupA.link(GroupB); + } + } +} + void PipelineSolver::makePipeline() { // Preserve the order of barrier for subsequent SchedGroupBarrier mutations for (auto &SyncPipeline : BestPipeline) { for (auto &SG : SyncPipeline) { + LLVM_DEBUG(dbgs() << "Printing SchedGroups\nSchedGroup with SGID " + << SG.getSGID() << " has: \n"); SUnit *SGBarr = nullptr; for (auto &SU : SG.Collection) { if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) SGBarr = SU; + LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); } // Command line requested IGroupLP doesn't have SGBarr if (!SGBarr) @@ -381,43 +413,47 @@ void PipelineSolver::makePipeline() { } for (auto &SyncPipeline : BestPipeline) { - auto I = SyncPipeline.rbegin(); - auto E = SyncPipeline.rend(); - for (; I != E; ++I) { - auto &GroupA = *I; - for (auto J = std::next(I); J != E; ++J) { - auto &GroupB = *J; - GroupA.link(GroupB); - } - } + IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) + : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); } } -int PipelineSolver::addEdges( - SmallVectorImpl &SyncPipeline, SUnit *SU, int SGID, - std::vector> &AddedEdges) { - int AddedCost = 0; +template +int PipelineSolver::linkSUnit( + SUnit *SU, int SGID, std::vector> &AddedEdges, + T I, T E) { bool MakePred = false; - - // The groups in the pipeline are in reverse order. 
Thus, - // by traversing them from last to first, we are traversing - // them in the order as they were introduced in the code. After we - // pass the group the SU is being assigned to, it should be - // linked as a predecessor of the subsequent SchedGroups - auto GroupNo = (int)SyncPipeline.size() - 1; - for (; GroupNo >= 0; GroupNo--) { - if (SyncPipeline[GroupNo].getSGID() == SGID) { + int AddedCost = 0; + for (; I < E; ++I) { + if (I->getSGID() == SGID) { MakePred = true; continue; } - auto Group = &SyncPipeline[GroupNo]; - AddedCost += Group->link(*SU, MakePred, AddedEdges); + auto Group = *I; + AddedCost += Group.link(*SU, MakePred, AddedEdges); assert(AddedCost >= 0); } - return AddedCost; } +int PipelineSolver::addEdges( + SmallVectorImpl &SyncPipeline, SUnit *SU, int SGID, + std::vector> &AddedEdges) { + + // For IsBottomUp, the first SchedGroup in SyncPipeline contains the + // instructions that are the ultimate successors in the resultant mutation. + // Therefore, in such a configuration, the SchedGroups occurring before the + // candidate SGID are successors of the candidate SchedGroup, thus the current + // SU should be linked as a predecessor to SUs in those SchedGroups. The + // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple + // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using + // IsBottomUp (in reverse). + return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), + SyncPipeline.rend()) + : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), + SyncPipeline.end()); +} + void PipelineSolver::removeEdges( const std::vector> &EdgesToRemove) { // Only remove the edges that we have added when testing @@ -490,12 +526,13 @@ bool PipelineSolver::checkOptimal() { return (DoneExploring || BestCost == 0); } +template void PipelineSolver::populateReadyList( - SUToCandSGsPair &CurrSU, SmallVectorImpl> &ReadyList, - SmallVectorImpl &SyncPipeline) { + SmallVectorImpl> &ReadyList, T I, T E) { + SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; + auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; assert(CurrSU.second.size() >= 1); - auto I = CurrSU.second.rbegin(); - auto E = CurrSU.second.rend(); + for (; I != E; ++I) { std::vector> AddedEdges; int CandSGID = *I; @@ -545,7 +582,10 @@ bool PipelineSolver::solveExact() { // SchedGroup -> Cost pairs SmallVector, 4> ReadyList; // Prioritize the candidate sched groups in terms of lowest cost first - populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]); + IsBottomUp ? 
populateReadyList(ReadyList, CurrSU.second.rbegin(), + CurrSU.second.rend()) + : populateReadyList(ReadyList, CurrSU.second.begin(), + CurrSU.second.end()); auto I = ReadyList.begin(); auto E = ReadyList.end(); @@ -620,64 +660,71 @@ bool PipelineSolver::solveExact() { return FinishedExploring; } -bool PipelineSolver::solveGreedy() { - BestCost = 0; - std::vector> AddedEdges; +template +void PipelineSolver::greedyFind( + std::vector> &AddedEdges, T I, T E) { + SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; + int BestNodeCost = -1; + int TempCost; + SchedGroup *BestGroup = nullptr; + int BestGroupID = -1; + auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; + LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum + << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); - while (static_cast(CurrSyncGroupIdx) < PipelineInstrs.size()) { - SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; - int BestNodeCost = -1; - int TempCost; - SchedGroup *BestGroup = nullptr; - int BestGroupID = -1; - auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; - LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum - << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); - - // Since we have added the potential SchedGroups from bottom up, but - // traversed the DAG from top down, parse over the groups from last to - // first. If we fail to do this for the greedy algorithm, the solution will - // likely not be good in more complex cases. - auto I = CurrSU.second.rbegin(); - auto E = CurrSU.second.rend(); - for (; I != E; ++I) { - std::vector> AddedEdges; - int CandSGID = *I; - SchedGroup *Match; - for (auto &SG : SyncPipeline) { - if (SG.getSGID() == CandSGID) - Match = &SG; - } + // Since we have added the potential SchedGroups from bottom up, but + // traversed the DAG from top down, parse over the groups from last to + // first. If we fail to do this for the greedy algorithm, the solution will + // likely not be good in more complex cases. 
+ for (; I != E; ++I) { + std::vector> AddedEdges; + int CandSGID = *I; + SchedGroup *Match; + for (auto &SG : SyncPipeline) { + if (SG.getSGID() == CandSGID) + Match = &SG; + } - LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " - << (int)Match->getMask() << "\n"); + LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " + << (int)Match->getMask() << "\n"); - if (Match->isFull()) { - LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); - continue; - } - TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); - LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); - if (TempCost < BestNodeCost || BestNodeCost == -1) { - BestGroup = Match; - BestNodeCost = TempCost; - BestGroupID = CandSGID; - } - removeEdges(AddedEdges); - if (BestNodeCost == 0) - break; + if (Match->isFull()) { + LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); + continue; } + TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); + LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); + if (TempCost < BestNodeCost || BestNodeCost == -1) { + BestGroup = Match; + BestNodeCost = TempCost; + BestGroupID = CandSGID; + } + removeEdges(AddedEdges); + if (BestNodeCost == 0) + break; + } - if (BestGroupID != -1) { - BestGroup->add(*CurrSU.first); - addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); - LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" - << (int)BestGroup->getMask() << "\n"); - BestCost += TempCost; - } else - BestCost += MissPenalty; + if (BestGroupID != -1) { + BestGroup->add(*CurrSU.first); + addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); + LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" + << (int)BestGroup->getMask() << "\n"); + BestCost += TempCost; + } else + BestCost += MissPenalty; - CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; + CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; +} + +bool PipelineSolver::solveGreedy() { + BestCost = 0; + std::vector> AddedEdges; + + while (static_cast(CurrSyncGroupIdx) < PipelineInstrs.size()) { + SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; + IsBottomUp + ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) + : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); advancePosition(); } BestPipeline = CurrPipeline; @@ -721,9 +768,11 @@ void PipelineSolver::solve() { } makePipeline(); + LLVM_DEBUG(dbgs() << "After applying mutation\n"); + LLVM_DEBUG(DAG->dump()); } -enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 }; +enum IGLPStrategyID : int { MFMASmallGemmOptID = 0, DemoOptID = 1 }; // Implement a IGLP scheduling strategy. class IGLPStrategy { @@ -741,6 +790,8 @@ class IGLPStrategy { // Returns true if this strategy should be applied to a ScheduleDAG. 
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; + bool IsBottomUp = 1; + IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : DAG(DAG), TII(TII) {} @@ -748,6 +799,7 @@ class IGLPStrategy { }; class MFMASmallGemmOpt final : public IGLPStrategy { +private: public: void applyIGLPStrategy( DenseMap &SyncedInstrs, @@ -756,7 +808,9 @@ class MFMASmallGemmOpt final : public IGLPStrategy { bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : IGLPStrategy(DAG, TII) {} + : IGLPStrategy(DAG, TII) { + IsBottomUp = 1; + } }; void MFMASmallGemmOpt::applyIGLPStrategy( @@ -781,12 +835,51 @@ void MFMASmallGemmOpt::applyIGLPStrategy( } } +class DemoOpt final : public IGLPStrategy { +private: +public: + void applyIGLPStrategy( + DenseMap &SyncedInstrs, + DenseMap> &SyncedSchedGroups) override; + + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } + + DemoOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + : IGLPStrategy(DAG, TII) { + IsBottomUp = 0; + } +}; + +void DemoOpt::applyIGLPStrategy( + DenseMap &SyncedInstrs, + DenseMap> &SyncedSchedGroups) { + // Count the number of MFMA instructions. + unsigned MFMACount = 0; + for (const MachineInstr &I : *DAG) + if (TII->isMFMAorWMMA(I)) + ++MFMACount; + + const unsigned PipelineSyncID = 0; + SchedGroup *SG = nullptr; + for (unsigned I = 0; I < MFMACount * 3; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } +} + static std::unique_ptr createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) { switch (ID) { case MFMASmallGemmOptID: return std::make_unique(DAG, TII); + case DemoOptID: + return std::make_unique(DAG, TII); } llvm_unreachable("Unknown IGLPStrategyID"); @@ -829,6 +922,13 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { public: void apply(ScheduleDAGInstrs *DAGInstrs) override; + // The order in which the PipelineSolver should process the candidate + // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last + // created SchedGroup first, and will consider that as the ultimate + // predecessor group when linking. TOP_DOWN instead links and processes the + // first created SchedGroup first. 
+ bool IsBottomUp = 1; + IGroupLPDAGMutation() = default; }; @@ -908,6 +1008,7 @@ int SchedGroup::link(SUnit &SU, bool MakePred, if (DAG->IsReachable(B, A)) continue; + // tryAddEdge returns false if there is a dependency that makes adding // the A->B edge impossible, otherwise it returns true; bool Added = tryAddEdge(A, B); @@ -1034,7 +1135,7 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { } if (foundSB || foundIGLP) { - PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG); + PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); // PipelineSolver performs the mutation by adding the edges it // determined as the best PS.solve(); @@ -1114,8 +1215,10 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { IGLPStrategyID StrategyID = (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); auto S = createIGLPStrategy(StrategyID, DAG, TII); - if (S->shouldApplyStrategy(DAG)) + if (S->shouldApplyStrategy(DAG)) { + IsBottomUp = S->IsBottomUp; S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups); + } } } // namespace diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index 11008221e8811..ff1a0c54d1acc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -147,6 +147,144 @@ entry: ret void } + +define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_mov_b32_e32 v3, 2.0 +; GCN-NEXT: ; iglp_opt mask(0x00000001) +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 +; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 +; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 +; GCN-NEXT: ds_read_b128 a[20:23], v2 offset:57424 +; GCN-NEXT: ds_read_b128 a[16:19], v2 offset:57408 +; GCN-NEXT: ds_read_b128 a[0:3], v2 offset:57344 +; GCN-NEXT: ds_read_b128 a[4:7], v2 offset:57360 +; GCN-NEXT: ds_read_b128 a[8:11], v2 offset:57376 +; GCN-NEXT: ds_read_b128 a[12:15], v2 offset:57392 +; GCN-NEXT: v_mov_b32_e32 v2, 1.0 +; GCN-NEXT: ds_read_b128 a[60:63], v1 offset:49264 +; GCN-NEXT: ds_read_b128 a[56:59], v1 offset:49248 +; GCN-NEXT: ds_read_b128 a[52:55], v1 offset:49232 +; GCN-NEXT: ds_read_b128 a[48:51], v1 offset:49216 +; GCN-NEXT: ds_read_b128 a[44:47], v1 offset:49200 +; GCN-NEXT: ds_read_b128 a[40:43], v1 offset:49184 +; GCN-NEXT: ds_read_b128 a[36:39], v1 offset:49168 +; GCN-NEXT: ds_read_b128 a[32:35], v1 offset:49152 +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GCN-NEXT: ds_read_b128 a[156:159], v1 offset:112 +; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 +; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 +; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 +; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(4) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] +; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 +; GCN-NEXT: ds_read_b128 a[144:147], v1 offset:64 +; GCN-NEXT: ds_read_b128 a[128:131], v1 +; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:16 +; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:32 +; GCN-NEXT: ds_read_b128 a[140:143], v1 offset:48 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] +; GCN-NEXT: 
ds_read_b128 a[124:127], v1 offset:8304 +; GCN-NEXT: ds_read_b128 a[120:123], v1 offset:8288 +; GCN-NEXT: ds_read_b128 a[116:119], v1 offset:8272 +; GCN-NEXT: ds_read_b128 a[112:115], v1 offset:8256 +; GCN-NEXT: ds_read_b128 a[108:111], v1 offset:8240 +; GCN-NEXT: ds_read_b128 a[104:107], v1 offset:8224 +; GCN-NEXT: ds_read_b128 a[100:103], v1 offset:8208 +; GCN-NEXT: ds_read_b128 a[96:99], v1 offset:8192 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[96:127], v2, v3, a[96:127] +; GCN-NEXT: ds_read_b128 a[92:95], v1 offset:24688 +; GCN-NEXT: ds_read_b128 a[88:91], v1 offset:24672 +; GCN-NEXT: ds_read_b128 a[84:87], v1 offset:24656 +; GCN-NEXT: ds_read_b128 a[80:83], v1 offset:24640 +; GCN-NEXT: ds_read_b128 a[76:79], v1 offset:24624 +; GCN-NEXT: ds_read_b128 a[72:75], v1 offset:24608 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:112 +; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:96 +; GCN-NEXT: ds_write_b128 v0, a[148:151] offset:80 +; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:64 +; GCN-NEXT: ds_write_b128 v0, a[140:143] offset:48 +; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 +; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 +; GCN-NEXT: ds_write_b128 v0, a[128:131] +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: s_waitcnt lgkmcnt(8) +; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] +; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 +; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:24688 +; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:24640 +; GCN-NEXT: ds_write_b128 v0, a[120:123] offset:8288 +; GCN-NEXT: ds_write_b128 v0, a[124:127] offset:8304 +; GCN-NEXT: ds_write_b128 v0, a[112:115] offset:8256 +; GCN-NEXT: ds_write_b128 v0, a[116:119] offset:8272 +; GCN-NEXT: ds_write_b128 v0, a[104:107] offset:8224 +; GCN-NEXT: ds_write_b128 v0, a[108:111] offset:8240 +; GCN-NEXT: ds_write_b128 v0, a[96:99] offset:8192 +; GCN-NEXT: ds_write_b128 v0, a[100:103] offset:8208 +; GCN-NEXT: ds_write_b128 v0, a[52:55] offset:24656 +; GCN-NEXT: ds_write_b128 v0, a[40:43] offset:24608 +; GCN-NEXT: ds_write_b128 v0, a[44:47] offset:24624 +; GCN-NEXT: ds_write_b128 v0, a[32:35] offset:24576 +; GCN-NEXT: ds_write_b128 v0, a[36:39] offset:24592 +; GCN-NEXT: ds_write_b128 v0, a[24:27] offset:32864 +; GCN-NEXT: ds_write_b128 v0, a[28:31] offset:32880 +; GCN-NEXT: ds_write_b128 v0, a[16:19] offset:32832 +; GCN-NEXT: ds_write_b128 v0, a[88:91] offset:16480 +; GCN-NEXT: ds_write_b128 v0, a[92:95] offset:16496 +; GCN-NEXT: ds_write_b128 v0, a[80:83] offset:16448 +; GCN-NEXT: ds_write_b128 v0, a[84:87] offset:16464 +; GCN-NEXT: ds_write_b128 v0, a[72:75] offset:16416 +; GCN-NEXT: ds_write_b128 v0, a[76:79] offset:16432 +; GCN-NEXT: ds_write_b128 v0, a[64:67] offset:16384 +; GCN-NEXT: ds_write_b128 v0, a[68:71] offset:16400 +; GCN-NEXT: ds_write_b128 v0, a[20:23] offset:32848 +; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32800 +; GCN-NEXT: ds_write_b128 v0, a[12:15] offset:32816 +; GCN-NEXT: ds_write_b128 v0, a[0:3] offset:32768 +; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:32784 +; GCN-NEXT: s_endpgm +entry: + call void @llvm.amdgcn.iglp.opt(i32 1) + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <32 x float>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <32 x float>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <32 x float>, ptr addrspace(3) %load.0.addr, i32 64 + %load.1 = load <32 x float>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <32 x float>, ptr addrspace(3) %load.1.addr, i32 
128 + %load.2 = load <32 x float>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <32 x float>, ptr addrspace(3) %load.2.addr, i32 192 + %load.3 = load <32 x float>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <32 x float>, ptr addrspace(3) %load.3.addr, i32 256 + %load.4 = load <32 x float>, ptr addrspace(3) %load.4.addr + %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.0, i32 0, i32 0, i32 0) + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.1, i32 0, i32 0, i32 0) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.2, i32 0, i32 0, i32 0) + %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.3, i32 0, i32 0, i32 0) + %mai.4 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %load.4, i32 0, i32 0, i32 0) + %store.0.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 %idx + store <32 x float> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 64 + store <32 x float> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 128 + store <32 x float> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 192 + store <32 x float> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <32 x float>, ptr addrspace(3) %out, i32 256 + store <32 x float> %mai.4, ptr addrspace(3) %store.4.addr + ret void +} + + declare void @llvm.amdgcn.iglp.opt(i32) #1 declare i32 @llvm.amdgcn.workitem.id.x() #1 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1 From 5da248c08f84fb089bb7b32aea39e48df4427699 Mon Sep 17 00:00:00 2001 From: Yuanfang Chen Date: Tue, 30 May 2023 14:40:30 -0700 Subject: [PATCH 154/704] [LLVM Utils] Update Certifi to 2023.5.7 Certifi 2022.12.07 removes root certificates from "TrustCor" from the root store. These are in the process of being removed from Mozilla's trust store. https://groups.google.com/a/mozilla.org/g/dev-security-policy/c/oxX69KFvsm4/m/yLohoVqtCgAJ --- llvm/utils/git/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/git/requirements.txt b/llvm/utils/git/requirements.txt index 08d3e4eeb3f5b..d001cf0f5798d 100644 --- a/llvm/utils/git/requirements.txt +++ b/llvm/utils/git/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=requirements.txt requirements.txt.in # -certifi==2022.9.24 +certifi==2023.5.7 # via requests cffi==1.15.1 # via pynacl From 48e5f704c55f406e0bdbd989434c8b3777b27fe4 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Tue, 16 May 2023 11:01:13 -0700 Subject: [PATCH 155/704] [lld-macho] Remove linking bitcode support Apple deprecated bitcode in the deployment process in Xcode 14.0. Last month Apple started requiring Xcode 14.1+ to submit apps to the App Store. Since there isn't a use for bundling bitcode outside of submitting to the App Store we should be safe to delete this handling entirely from LLD. 
Differential Revision: https://reviews.llvm.org/D150697 --- lld/CMakeLists.txt | 4 -- lld/MachO/CMakeLists.txt | 5 -- lld/MachO/Config.h | 1 - lld/MachO/Driver.cpp | 6 -- lld/MachO/Options.td | 45 ++++++------ lld/MachO/SyntheticSections.cpp | 63 ----------------- lld/MachO/SyntheticSections.h | 12 ---- lld/MachO/Writer.cpp | 2 - lld/test/CMakeLists.txt | 1 - lld/test/MachO/bitcode-bundle.ll | 69 ------------------- .../{no-libxar.ll => no-bitcode-support.ll} | 3 +- lld/test/lit.cfg.py | 3 - lld/test/lit.site.cfg.py.in | 1 - 13 files changed, 23 insertions(+), 192 deletions(-) delete mode 100644 lld/test/MachO/bitcode-bundle.ll rename lld/test/MachO/invalid/{no-libxar.ll => no-bitcode-support.ll} (75%) diff --git a/lld/CMakeLists.txt b/lld/CMakeLists.txt index a24330d2f18e9..518289a6328b6 100644 --- a/lld/CMakeLists.txt +++ b/lld/CMakeLists.txt @@ -99,10 +99,6 @@ if(LLD_BUILT_STANDALONE) set(LLVM_INCLUDE_TESTS OFF) endif() endif() - - if(LLVM_HAVE_LIBXAR) - set(XAR_LIB xar) - endif() endif() # standalone set(LLD_TOOLS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt index 91b34f14b7a8a..0b92488b00bea 100644 --- a/lld/MachO/CMakeLists.txt +++ b/lld/MachO/CMakeLists.txt @@ -54,13 +54,8 @@ add_lld_library(lldMachO LINK_LIBS lldCommon ${LLVM_PTHREAD_LIB} - ${XAR_LIB} DEPENDS MachOOptionsTableGen ${tablegen_deps} ) - -if(LLVM_HAVE_LIBXAR) - target_link_libraries(lldMachO PRIVATE ${XAR_LIB}) -endif() diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index feb502d0630d5..762f5a1edd134 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -131,7 +131,6 @@ struct Configuration { bool saveTemps = false; bool adhocCodesign = false; bool emitFunctionStarts = false; - bool emitBitcodeBundle = false; bool emitDataInCodeInfo = false; bool emitEncryptionInfo = false; bool emitInitOffsets = false; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index d614df678080a..687a8b7c6e18f 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1614,7 +1614,6 @@ bool macho::link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, config->implicitDylibs = !args.hasArg(OPT_no_implicit_dylibs); config->emitFunctionStarts = args.hasFlag(OPT_function_starts, OPT_no_function_starts, true); - config->emitBitcodeBundle = args.hasArg(OPT_bitcode_bundle); config->emitDataInCodeInfo = args.hasFlag(OPT_data_in_code_info, OPT_no_data_in_code_info, true); config->emitChainedFixups = shouldEmitChainedFixups(args); @@ -1654,11 +1653,6 @@ bool macho::link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, args.hasFlag(OPT_encryptable, OPT_no_encryption, is_contained(encryptablePlatforms, config->platform())); -#ifndef LLVM_HAVE_LIBXAR - if (config->emitBitcodeBundle) - error("-bitcode_bundle unsupported because LLD wasn't built with libxar"); -#endif - if (const Arg *arg = args.getLastArg(OPT_install_name)) { if (config->warnDylibInstallName && config->outputType != MH_DYLIB) warn( diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index 4d4181471c3e8..9108d68205799 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -644,21 +644,6 @@ def add_ast_path : Separate<["-"], "add_ast_path">, HelpText<"AST paths will be emitted as STABS">, Group; -def grp_bitcode : OptionGroup<"bitcode">, HelpText<"BITCODE BUILD FLOW">; - -def bitcode_bundle : Flag<["-"], "bitcode_bundle">, - HelpText<"Generate an embedded bitcode bundle in the __LLVM,__bundle section of the output">, - Group; -def bitcode_hide_symbols : Flag<["-"], "bitcode_hide_symbols">, 
- HelpText<"With -bitcode_bundle, hide all non-exported symbols from output bitcode bundle.">, - Flags<[HelpHidden]>, - Group; -def bitcode_symbol_map : Separate<["-"], "bitcode_symbol_map">, - MetaVarName<"">, - HelpText<"Write the bitcode symbol reverse mapping to file , or if a directory, to /UUID.bcsymbolmap">, - Flags<[HelpHidden]>, - Group; - def grp_rare : OptionGroup<"rare">, HelpText<"RARELY USED">; def v : Flag<["-"], "v">, @@ -1200,6 +1185,28 @@ def slow_stubs : Flag<["-"], "slow_stubs">, Flags<[HelpHidden]>, Group; +def bitcode_bundle : Flag<["-"], "bitcode_bundle">, + HelpText<"Obsolete since the App Store no longer supports binaries with embedded bitcode">, + Flags<[HelpHidden]>, + Group; +def bitcode_hide_symbols : Flag<["-"], "bitcode_hide_symbols">, + HelpText<"Obsolete since the App Store no longer supports binaries with embedded bitcode">, + Flags<[HelpHidden]>, + Group; +def bitcode_symbol_map : Separate<["-"], "bitcode_symbol_map">, + MetaVarName<"">, + HelpText<"Obsolete since the App Store no longer supports binaries with embedded bitcode">, + Flags<[HelpHidden]>, + Group; +def bitcode_process_mode : Separate<["-"], "bitcode_process_mode">, + HelpText<"Obsolete since the App Store no longer supports binaries with embedded bitcode">, + Flags<[HelpHidden]>, + Group; +def bitcode_verify : Flag<["-"], "bitcode_verify">, + HelpText<"Obsolete since the App Store no longer supports binaries with embedded bitcode">, + Flags<[HelpHidden]>, + Group; + def grp_undocumented : OptionGroup<"undocumented">, HelpText<"UNDOCUMENTED">; def add_linker_option : Flag<["-"], "add_linker_option">, @@ -1226,14 +1233,6 @@ def allow_simulator_linking_to_macosx_dylibs : Flag<["-"], "allow_simulator_link HelpText<"This option is undocumented in ld64">, Flags<[HelpHidden]>, Group; -def bitcode_process_mode : Separate<["-"], "bitcode_process_mode">, - HelpText<"This option is undocumented in ld64">, - Flags<[HelpHidden]>, - Group; -def bitcode_verify : Flag<["-"], "bitcode_verify">, - HelpText<"This option is undocumented in ld64">, - Flags<[HelpHidden]>, - Group; def classic_linker : Flag<["-"], "classic_linker">, HelpText<"This option is undocumented in ld64">, Flags<[HelpHidden]>, diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 9aadc29278ff9..f7cad7345fc86 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -35,13 +35,6 @@ #include "llvm/Support/SHA256.h" #endif -#ifdef LLVM_HAVE_LIBXAR -#include -extern "C" { -#include -} -#endif - using namespace llvm; using namespace llvm::MachO; using namespace llvm::support; @@ -1553,62 +1546,6 @@ void CodeSignatureSection::writeTo(uint8_t *buf) const { memset(id + fileName.size(), 0, fileNamePad); } -BitcodeBundleSection::BitcodeBundleSection() - : SyntheticSection(segment_names::llvm, section_names::bitcodeBundle) {} - -class ErrorCodeWrapper { -public: - explicit ErrorCodeWrapper(std::error_code ec) : errorCode(ec.value()) {} - explicit ErrorCodeWrapper(int ec) : errorCode(ec) {} - operator int() const { return errorCode; } - -private: - int errorCode; -}; - -#define CHECK_EC(exp) \ - do { \ - ErrorCodeWrapper ec(exp); \ - if (ec) \ - fatal(Twine("operation failed with error code ") + Twine(ec) + ": " + \ - #exp); \ - } while (0); - -void BitcodeBundleSection::finalize() { -#ifdef LLVM_HAVE_LIBXAR - using namespace llvm::sys::fs; - CHECK_EC(createTemporaryFile("bitcode-bundle", "xar", xarPath)); - -#pragma clang diagnostic push -#pragma clang diagnostic ignored 
"-Wdeprecated-declarations" - xar_t xar(xar_open(xarPath.data(), O_RDWR)); -#pragma clang diagnostic pop - if (!xar) - fatal("failed to open XAR temporary file at " + xarPath); - CHECK_EC(xar_opt_set(xar, XAR_OPT_COMPRESSION, XAR_OPT_VAL_NONE)); - // FIXME: add more data to XAR - CHECK_EC(xar_close(xar)); - - file_size(xarPath, xarSize); -#endif // defined(LLVM_HAVE_LIBXAR) -} - -void BitcodeBundleSection::writeTo(uint8_t *buf) const { - using namespace llvm::sys::fs; - file_t handle = - CHECK(openNativeFile(xarPath, CD_OpenExisting, FA_Read, OF_None), - "failed to open XAR file"); - std::error_code ec; - mapped_file_region xarMap(handle, mapped_file_region::mapmode::readonly, - xarSize, 0, ec); - if (ec) - fatal("failed to map XAR file"); - memcpy(buf, xarMap.const_data(), xarSize); - - closeFile(handle); - remove(xarPath); -} - CStringSection::CStringSection(const char *name) : SyntheticSection(segment_names::text, name) { flags = S_CSTRING_LITERALS; diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h index 83321889fdde1..e9d564f3c8361 100644 --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -533,18 +533,6 @@ class CodeSignatureSection final : public LinkEditSection { void writeHashes(uint8_t *buf) const; }; -class BitcodeBundleSection final : public SyntheticSection { -public: - BitcodeBundleSection(); - uint64_t getSize() const override { return xarSize; } - void finalize() override; - void writeTo(uint8_t *buf) const override; - -private: - llvm::SmallString<261> xarPath; - uint64_t xarSize; -}; - class CStringSection : public SyntheticSection { public: CStringSection(const char *name); diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index fe7cd39d2db88..68b22bf10c354 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -978,8 +978,6 @@ template void Writer::createOutputSections() { dataInCodeSection = make(); if (config->emitFunctionStarts) functionStartsSection = make(); - if (config->emitBitcodeBundle) - make(); switch (config->outputType) { case MH_EXECUTE: diff --git a/lld/test/CMakeLists.txt b/lld/test/CMakeLists.txt index d290533ada5ad..042bfd9140b6b 100644 --- a/lld/test/CMakeLists.txt +++ b/lld/test/CMakeLists.txt @@ -4,7 +4,6 @@ llvm_canonicalize_cmake_booleans( LLVM_ENABLE_ZSTD LLVM_ENABLE_LIBXML2 LLD_DEFAULT_LD_LLD_IS_MINGW - LLVM_HAVE_LIBXAR LLVM_BUILD_EXAMPLES LLVM_ENABLE_PLUGINS LLVM_BYE_LINK_INTO_TOOLS diff --git a/lld/test/MachO/bitcode-bundle.ll b/lld/test/MachO/bitcode-bundle.ll deleted file mode 100644 index 5deedbfb7b952..0000000000000 --- a/lld/test/MachO/bitcode-bundle.ll +++ /dev/null @@ -1,69 +0,0 @@ -; REQUIRES: x86, xar -; RUN: rm -rf %t; split-file %s %t -; RUN: opt -module-summary %t/test.ll -o %t/test.o -; RUN: opt -module-summary %t/foo.ll -o %t/foo.o -; RUN: %lld -lSystem -bitcode_bundle %t/test.o %t/foo.o -o %t/test -; RUN: llvm-objdump --macho --section=__LLVM,__bundle %t/test | FileCheck %s -; RUN: llvm-readobj --macho-segment %t/test | FileCheck %s --check-prefix=SEGMENT - -; CHECK: Contents of (__LLVM,__bundle) section -; CHECK-NEXT: For (__LLVM,__bundle) section: xar header -; CHECK-NEXT: magic XAR_HEADER_MAGIC -; CHECK-NEXT: size 28 -; CHECK-NEXT: version 1 -; CHECK-NEXT: toc_length_compressed -; CHECK-NEXT: toc_length_uncompressed -; CHECK-NEXT: cksum_alg XAR_CKSUM_SHA1 -; CHECK-NEXT: For (__LLVM,__bundle) section: xar table of contents: -; CHECK-NEXT: -; CHECK-NEXT: -; CHECK-NEXT: -; CHECK-NEXT: -; CHECK-NEXT: 20 -; CHECK-NEXT: 0 -; CHECK-NEXT: -; CHECK-NEXT: {{.*}} -; 
CHECK-NEXT: -; CHECK-NEXT: - -;; __LLVM must directly precede __LINKEDIT. -; SEGMENT: Name: __LLVM -; SEGMENT-NEXT: Size: 152 -; SEGMENT-NEXT: vmaddr: 0x[[#%X,LLVM_ADDR:]] -; SEGMENT-NEXT: vmsize: 0x[[#%X,LLVM_VMSIZE:]] -; SEGMENT-NEXT: fileoff: [[#LLVM_OFF:]] -; SEGMENT-NEXT: filesize: [[#LLVM_FILESIZE:]] -; SEGMENT-NEXT: maxprot: rw- -; SEGMENT-NEXT: initprot: rw- -; SEGMENT-NEXT: nsects: 1 -; SEGMENT-NEXT: flags: 0x0 -; SEGMENT-NEXT: } -; SEGMENT-NEXT: Segment { -; SEGMENT-NEXT: Cmd: LC_SEGMENT_64 -; SEGMENT-NEXT: Name: __LINKEDIT -; SEGMENT-NEXT: Size: 72 -; SEGMENT-NEXT: vmaddr: 0x[[#LLVM_ADDR + LLVM_VMSIZE]] -; SEGMENT-NEXT: vmsize: -; SEGMENT-NEXT: fileoff: [[#LLVM_OFF + LLVM_FILESIZE]] -; SEGMENT-NEXT: filesize: -; SEGMENT-NEXT: maxprot: r-- -; SEGMENT-NEXT: initprot: r-- -; SEGMENT-NEXT: nsects: 0 -; SEGMENT-NEXT: flags: 0x0 -; SEGMENT-NEXT: } - -;--- foo.ll -target triple = "x86_64-apple-darwin" -target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" - -define void @foo() { - ret void -} - -;--- test.ll -target triple = "x86_64-apple-darwin" -target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" - -define void @main() { - ret void -} diff --git a/lld/test/MachO/invalid/no-libxar.ll b/lld/test/MachO/invalid/no-bitcode-support.ll similarity index 75% rename from lld/test/MachO/invalid/no-libxar.ll rename to lld/test/MachO/invalid/no-bitcode-support.ll index 62a2599c62e2f..466441dcd2cb0 100644 --- a/lld/test/MachO/invalid/no-libxar.ll +++ b/lld/test/MachO/invalid/no-bitcode-support.ll @@ -1,8 +1,7 @@ ; REQUIRES: x86 -; UNSUPPORTED: xar ; RUN: opt -module-summary %s -o %t.o ; RUN: not %lld -lSystem -bitcode_bundle %t.o -o /dev/null 2>&1 | FileCheck %s -; CHECK: error: -bitcode_bundle unsupported because LLD wasn't built with libxar +; CHECK: error: Option `-bitcode_bundle' is obsolete. Please modernize your usage. 
target triple = "x86_64-apple-darwin" target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/lld/test/lit.cfg.py b/lld/test/lit.cfg.py index 2e60d9fef7dfa..e03a6dcf47fd9 100644 --- a/lld/test/lit.cfg.py +++ b/lld/test/lit.cfg.py @@ -125,9 +125,6 @@ if config.enable_backtrace: config.available_features.add("backtrace") -if config.have_libxar: - config.available_features.add("xar") - if config.have_libxml2: config.available_features.add("libxml2") diff --git a/lld/test/lit.site.cfg.py.in b/lld/test/lit.site.cfg.py.in index c324b53021a17..39b54924f4a19 100644 --- a/lld/test/lit.site.cfg.py.in +++ b/lld/test/lit.site.cfg.py.in @@ -20,7 +20,6 @@ config.target_triple = "@LLVM_TARGET_TRIPLE@" config.python_executable = "@Python3_EXECUTABLE@" config.have_zlib = @LLVM_ENABLE_ZLIB@ config.have_zstd = @LLVM_ENABLE_ZSTD@ -config.have_libxar = @LLVM_HAVE_LIBXAR@ config.have_libxml2 = @LLVM_ENABLE_LIBXML2@ config.sizeof_void_p = @CMAKE_SIZEOF_VOID_P@ config.ld_lld_default_mingw = @LLD_DEFAULT_LD_LLD_IS_MINGW@ From bf63b15bd4bf4ca0b0d56319af74eb259e0b6d3e Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Tue, 30 May 2023 09:27:10 -0700 Subject: [PATCH 156/704] [libc++][PSTL] Implement std::merge Reviewed By: ldionne, #libc Spies: pcwang-thead, libcxx-commits Differential Revision: https://reviews.llvm.org/D151375 --- libcxx/include/CMakeLists.txt | 2 + libcxx/include/__algorithm/pstl_backend.h | 9 ++ .../__algorithm/pstl_backends/cpu_backend.h | 15 ++ .../pstl_backends/cpu_backends/merge.h | 79 +++++++++++ .../pstl_backends/cpu_backends/serial.h | 16 +++ .../pstl_backends/cpu_backends/thread.h | 16 +++ libcxx/include/__algorithm/pstl_merge.h | 56 ++++++++ .../include/__pstl/internal/algorithm_impl.h | 83 ----------- .../__pstl/internal/glue_algorithm_impl.h | 31 ---- .../__pstl/internal/parallel_backend_serial.h | 10 -- libcxx/include/algorithm | 1 + libcxx/include/module.modulemap.in | 3 + libcxx/test/libcxx/private_headers.verify.cpp | 1 + .../alg.sorting/alg.merge/pstl.merge.pass.cpp | 132 ++++++++++++++++++ 14 files changed, 330 insertions(+), 124 deletions(-) create mode 100644 libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h create mode 100644 libcxx/include/__algorithm/pstl_merge.h create mode 100644 libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.merge.pass.cpp diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 1939a5569fe2f..94b3944d6f09c 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -77,6 +77,7 @@ set(files __algorithm/pstl_backends/cpu_backends/fill.h __algorithm/pstl_backends/cpu_backends/find_if.h __algorithm/pstl_backends/cpu_backends/for_each.h + __algorithm/pstl_backends/cpu_backends/merge.h __algorithm/pstl_backends/cpu_backends/serial.h __algorithm/pstl_backends/cpu_backends/thread.h __algorithm/pstl_backends/cpu_backends/transform.h @@ -85,6 +86,7 @@ set(files __algorithm/pstl_find.h __algorithm/pstl_for_each.h __algorithm/pstl_frontend_dispatch.h + __algorithm/pstl_merge.h __algorithm/pstl_transform.h __algorithm/push_heap.h __algorithm/ranges_adjacent_find.h diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h index f57477310963c..ae37e56a79499 100644 --- a/libcxx/include/__algorithm/pstl_backend.h +++ b/libcxx/include/__algorithm/pstl_backend.h @@ -72,6 +72,15 @@ implemented, all the algorithms will eventually forward to the basis algorithms template void __pstl_fill_n(_Backend, _Iterator __first, 
_SizeT __n, const _Tp& __value); + template + _OutIterator __pstl_merge(_Backend, + _Iterator1 __first1, + _Iterator1 __last1, + _Iterator2 __first2, + _Iterator2 __last2, + _OutIterator __result, + _Comp __comp); + // TODO: Complete this list */ diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h index a15e7f8f1acd2..3939b82110b49 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h @@ -20,6 +20,20 @@ // Cancel the execution of other jobs - they aren't needed anymore void __cancel_execution(); + template + void __parallel_merge( + _RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge); + TODO: Document the parallel backend */ @@ -27,6 +41,7 @@ #include <__algorithm/pstl_backends/cpu_backends/fill.h> #include <__algorithm/pstl_backends/cpu_backends/find_if.h> #include <__algorithm/pstl_backends/cpu_backends/for_each.h> +#include <__algorithm/pstl_backends/cpu_backends/merge.h> #include <__algorithm/pstl_backends/cpu_backends/transform.h> #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h new file mode 100644 index 0000000000000..d5be1e302d353 --- /dev/null +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_MERGE_H +#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_MERGE_H + +#include <__algorithm/merge.h> +#include <__algorithm/pstl_backends/cpu_backends/backend.h> +#include <__config> +#include <__iterator/iterator_traits.h> +#include <__type_traits/is_execution_policy.h> +#include <__utility/move.h> +#include <__utility/terminate_on_exception.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +_LIBCPP_BEGIN_NAMESPACE_STD + +template +_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_merge( + __cpu_backend_tag, + _ForwardIterator1 __first1, + _ForwardIterator1 __last1, + _ForwardIterator2 __first2, + _ForwardIterator2 __last2, + _ForwardOutIterator __result, + _Comp __comp) { + if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category<_ForwardIterator1>::value && + __has_random_access_iterator_category<_ForwardIterator2>::value && + __has_random_access_iterator_category<_ForwardOutIterator>::value) { + return std::__terminate_on_exception([&] { + __par_backend::__parallel_merge( + __first1, + __last1, + __first2, + __last2, + __result, + __comp, + [](_ForwardIterator1 __g_first1, + _ForwardIterator1 __g_last1, + _ForwardIterator2 __g_first2, + _ForwardIterator2 __g_last2, + _ForwardOutIterator __g_result, + _Comp __g_comp) { + return std::__pstl_merge<__remove_parallel_policy_t<_ExecutionPolicy>>( + __cpu_backend_tag{}, + std::move(__g_first1), + std::move(__g_last1), + std::move(__g_first2), + std::move(__g_last2), + std::move(__g_result), + std::move(__g_comp)); + }); + return __result + (__last1 - __first1) + (__last2 - __first2); + }); + } else { + return std::merge(__first1, __last1, __first2, __last2, __result, __comp); + } +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_MERGE_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h index ccd24cb15ba89..0c3aafae6c137 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h @@ -30,6 +30,22 @@ _LIBCPP_HIDE_FROM_ABI void __parallel_for(_RandomAccessIterator __first, _Random _LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {} +template +_LIBCPP_HIDE_FROM_ABI void __parallel_merge( + _RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { + __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); +} + // TODO: Complete this list } // namespace __serial_cpu_backend diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h index 967ce8c9acc28..93745d3068862 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h @@ -33,6 +33,22 @@ _LIBCPP_HIDE_FROM_ABI void __parallel_for(_RandomAccessIterator __first, _Random _LIBCPP_HIDE_FROM_ABI inline 
void __cancel_execution() {} +template +_LIBCPP_HIDE_FROM_ABI void __parallel_merge( + _RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { + __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); +} + } // namespace __thread_cpu_backend } // namespace __par_backend diff --git a/libcxx/include/__algorithm/pstl_merge.h b/libcxx/include/__algorithm/pstl_merge.h new file mode 100644 index 0000000000000..b5585eeec83d6 --- /dev/null +++ b/libcxx/include/__algorithm/pstl_merge.h @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_PSTL_MERGE_H +#define _LIBCPP___ALGORITHM_PSTL_MERGE_H + +#include <__algorithm/pstl_backend.h> +#include <__config> +#include <__functional/operations.h> +#include <__type_traits/is_execution_policy.h> +#include <__type_traits/remove_cvref.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +_LIBCPP_BEGIN_NAMESPACE_STD + +template , + class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>, + enable_if_t, int> = 0> +_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator +merge(_ExecutionPolicy&&, + _ForwardIterator1 __first1, + _ForwardIterator1 __last1, + _ForwardIterator2 __first2, + _ForwardIterator2 __last2, + _ForwardOutIterator __result, + _Comp __comp = {}) { + using _Backend = typename __select_backend<_RawPolicy>::type; + return std::__pstl_merge<_RawPolicy>( + _Backend{}, + std::move(__first1), + std::move(__last1), + std::move(__first2), + std::move(__last2), + std::move(__result), + std::move(__comp)); +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +#endif // _LIBCPP___ALGORITHM_PSTL_MERGE_H diff --git a/libcxx/include/__pstl/internal/algorithm_impl.h b/libcxx/include/__pstl/internal/algorithm_impl.h index 6f35324b12238..99cff2b114131 100644 --- a/libcxx/include/__pstl/internal/algorithm_impl.h +++ b/libcxx/include/__pstl/internal/algorithm_impl.h @@ -2869,89 +2869,6 @@ _RandomAccessIterator __pattern_remove_if( }); } -//------------------------------------------------------------------------ -// merge -//------------------------------------------------------------------------ - -template -_OutputIterator __brick_merge( - _ForwardIterator1 __first1, - _ForwardIterator1 __last1, - _ForwardIterator2 __first2, - _ForwardIterator2 __last2, - _OutputIterator __d_first, - _Compare __comp, - /* __is_vector = */ std::false_type) noexcept { - return std::merge(__first1, __last1, __first2, __last2, __d_first, __comp); -} - -template -_RandomAccessIterator3 __brick_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __d_first, - _Compare __comp, - /* __is_vector = */ std::true_type) noexcept { - // TODO: vectorize - return std::merge(__first1, __last1, __first2, __last2, __d_first, __comp); -} - -template -_OutputIterator __pattern_merge( - _Tag, - 
_ExecutionPolicy&&, - _ForwardIterator1 __first1, - _ForwardIterator1 __last1, - _ForwardIterator2 __first2, - _ForwardIterator2 __last2, - _OutputIterator __d_first, - _Compare __comp) noexcept { - return __internal::__brick_merge( - __first1, __last1, __first2, __last2, __d_first, __comp, typename _Tag::__is_vector{}); -} - -template -_RandomAccessIterator3 __pattern_merge( - __parallel_tag<_IsVector> __tag, - _ExecutionPolicy&& __exec, - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __d_first, - _Compare __comp) { - using __backend_tag = typename decltype(__tag)::__backend_tag; - - __par_backend::__parallel_merge( - __backend_tag{}, - std::forward<_ExecutionPolicy>(__exec), - __first1, - __last1, - __first2, - __last2, - __d_first, - __comp, - [](_RandomAccessIterator1 __f1, - _RandomAccessIterator1 __l1, - _RandomAccessIterator2 __f2, - _RandomAccessIterator2 __l2, - _RandomAccessIterator3 __f3, - _Compare __comp) { return __internal::__brick_merge(__f1, __l1, __f2, __l2, __f3, __comp, _IsVector{}); }); - return __d_first + (__last1 - __first1) + (__last2 - __first2); -} - //------------------------------------------------------------------------ // inplace_merge //------------------------------------------------------------------------ diff --git a/libcxx/include/__pstl/internal/glue_algorithm_impl.h b/libcxx/include/__pstl/internal/glue_algorithm_impl.h index 942ea0dea50bf..4b17133079dd4 100644 --- a/libcxx/include/__pstl/internal/glue_algorithm_impl.h +++ b/libcxx/include/__pstl/internal/glue_algorithm_impl.h @@ -763,37 +763,6 @@ is_sorted(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator } // [alg.merge] -template -__pstl::__internal::__enable_if_execution_policy<_ExecutionPolicy, _ForwardIterator> -merge(_ExecutionPolicy&& __exec, - _ForwardIterator1 __first1, - _ForwardIterator1 __last1, - _ForwardIterator2 __first2, - _ForwardIterator2 __last2, - _ForwardIterator __d_first, - _Compare __comp) { - auto __dispatch_tag = __pstl::__internal::__select_backend(__exec, __first1, __first2, __d_first); - - return __pstl::__internal::__pattern_merge( - __dispatch_tag, std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __d_first, __comp); -} - -template -__pstl::__internal::__enable_if_execution_policy<_ExecutionPolicy, _ForwardIterator> -merge(_ExecutionPolicy&& __exec, - _ForwardIterator1 __first1, - _ForwardIterator1 __last1, - _ForwardIterator2 __first2, - _ForwardIterator2 __last2, - _ForwardIterator __d_first) { - return std::merge( - std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __d_first, std::less<>()); -} - template __pstl::__internal::__enable_if_execution_policy<_ExecutionPolicy, void> inplace_merge(_ExecutionPolicy&& __exec, diff --git a/libcxx/include/__pstl/internal/parallel_backend_serial.h b/libcxx/include/__pstl/internal/parallel_backend_serial.h index b3ecb82175a64..4714ed4194fdb 100644 --- a/libcxx/include/__pstl/internal/parallel_backend_serial.h +++ b/libcxx/include/__pstl/internal/parallel_backend_serial.h @@ -98,16 +98,6 @@ __parallel_stable_sort(__pstl::__internal::__serial_backend_tag, _ExecutionPolic __leaf_sort(__first, __last, __comp); } -template -_LIBCPP_HIDE_FROM_ABI void -__parallel_merge(__pstl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, 
_RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, _Compare __comp, _LeafMerge __leaf_merge) -{ - __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); -} - template _LIBCPP_HIDE_FROM_ABI void __parallel_invoke(__pstl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _F1&& __f1, _F2&& __f2) diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm index 190214021df73..0f52da9a17d85 100644 --- a/libcxx/include/algorithm +++ b/libcxx/include/algorithm @@ -1805,6 +1805,7 @@ template #include <__algorithm/pstl_fill.h> #include <__algorithm/pstl_find.h> #include <__algorithm/pstl_for_each.h> +#include <__algorithm/pstl_merge.h> #include <__algorithm/pstl_transform.h> #include <__algorithm/push_heap.h> #include <__algorithm/ranges_adjacent_find.h> diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 743fce27f7133..814b611296168 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -338,6 +338,9 @@ module std [system] { module pstl_backends_cpu_backends_for_each { private header "__algorithm/pstl_backends/cpu_backends/for_each.h" } + module pstl_backends_cpu_backends_merge { + private header "__algorithm/pstl_backends/cpu_backends/merge.h" + } module pstl_backends_cpu_backends_serial { private header "__algorithm/pstl_backends/cpu_backends/serial.h" } diff --git a/libcxx/test/libcxx/private_headers.verify.cpp b/libcxx/test/libcxx/private_headers.verify.cpp index 2c79212175ba4..972c91d6c8f96 100644 --- a/libcxx/test/libcxx/private_headers.verify.cpp +++ b/libcxx/test/libcxx/private_headers.verify.cpp @@ -120,6 +120,7 @@ END-SCRIPT #include <__algorithm/pstl_backends/cpu_backends/fill.h> // expected-error@*:* {{use of private header from outside its module: '__algorithm/pstl_backends/cpu_backends/fill.h'}} #include <__algorithm/pstl_backends/cpu_backends/find_if.h> // expected-error@*:* {{use of private header from outside its module: '__algorithm/pstl_backends/cpu_backends/find_if.h'}} #include <__algorithm/pstl_backends/cpu_backends/for_each.h> // expected-error@*:* {{use of private header from outside its module: '__algorithm/pstl_backends/cpu_backends/for_each.h'}} +#include <__algorithm/pstl_backends/cpu_backends/merge.h> // expected-error@*:* {{use of private header from outside its module: '__algorithm/pstl_backends/cpu_backends/merge.h'}} #include <__algorithm/pstl_backends/cpu_backends/serial.h> // expected-error@*:* {{use of private header from outside its module: '__algorithm/pstl_backends/cpu_backends/serial.h'}} #include <__algorithm/pstl_backends/cpu_backends/thread.h> // expected-error@*:* {{use of private header from outside its module: '__algorithm/pstl_backends/cpu_backends/thread.h'}} #include <__algorithm/pstl_backends/cpu_backends/transform.h> // expected-error@*:* {{use of private header from outside its module: '__algorithm/pstl_backends/cpu_backends/transform.h'}} diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.merge.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.merge.pass.cpp new file mode 100644 index 0000000000000..aa16208b90d0f --- /dev/null +++ b/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.merge.pass.cpp @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 +// UNSUPPORTED: libcpp-has-no-incomplete-pstl + +// template +// ForwardIterator +// merge(ExecutionPolicy&& exec, +// ForwardIterator1 first1, ForwardIterator1 last1, +// ForwardIterator2 first2, ForwardIterator2 last2, +// ForwardIterator result); +// +// template +// ForwardIterator +// merge(ExecutionPolicy&& exec, +// ForwardIterator1 first1, ForwardIterator1 last1, +// ForwardIterator2 first2, ForwardIterator2 last2, +// ForwardIterator result, Compare comp); + +#include +#include +#include +#include +#include +#include + +#include "type_algorithms.h" +#include "test_execution_policies.h" +#include "test_iterators.h" + +template +struct Test { + template + void operator()(Policy&& policy) { + { // simple test + int a[] = {1, 3, 5, 7, 9}; + int b[] = {2, 4, 6, 8, 10}; + std::array out; + std::merge( + policy, Iter1(std::begin(a)), Iter1(std::end(a)), Iter2(std::begin(b)), Iter2(std::end(b)), std::begin(out)); + assert((out == std::array{1, 2, 3, 4, 5, 6, 7, 8, 9, 10})); + } + + { // check that it works with the first range being empty + std::array a; + int b[] = {2, 4, 6, 8, 10}; + std::array out; + std::merge( + policy, Iter1(std::begin(a)), Iter1(std::end(a)), Iter2(std::begin(b)), Iter2(std::end(b)), std::begin(out)); + assert((out == std::array{2, 4, 6, 8, 10})); + } + + { // check that it works with the second range being empty + int a[] = {2, 4, 6, 8, 10}; + std::array b; + std::array out; + std::merge( + policy, Iter1(std::begin(a)), Iter1(std::end(a)), Iter2(std::begin(b)), Iter2(std::end(b)), std::begin(out)); + assert((out == std::array{2, 4, 6, 8, 10})); + } + + { // check that it works when the ranges don't have the same length + int a[] = {2, 4, 6, 8, 10}; + int b[] = {3, 4}; + std::array out; + std::merge( + policy, Iter1(std::begin(a)), Iter1(std::end(a)), Iter2(std::begin(b)), Iter2(std::end(b)), std::begin(out)); + assert((out == std::array{2, 3, 4, 4, 6, 8, 10})); + } + + { // check that large ranges work + std::vector a(100); + std::vector b(100); + { + int i = 0; + for (auto& e : a) { + e = i; + i += 2; + } + } + + { + int i = 1; + for (auto& e : b) { + e = i; + i += 2; + } + } + + std::vector out(std::size(a) + std::size(b)); + std::merge( + Iter1(a.data()), Iter1(a.data() + a.size()), Iter2(b.data()), Iter2(b.data() + b.size()), std::begin(out)); + std::vector expected(200); + std::iota(expected.begin(), expected.end(), 0); + assert(std::equal(out.begin(), out.end(), expected.begin())); + } + + { // check that the predicate is used + int a[] = {10, 9, 8, 7}; + int b[] = {8, 4, 3}; + std::array out; + std::merge( + policy, + Iter1(std::begin(a)), + Iter1(std::end(a)), + Iter2(std::begin(b)), + Iter2(std::end(b)), + std::begin(out), + std::greater{}); + assert((out == std::array{10, 9, 8, 8, 7, 4, 3})); + } + } +}; + +int main(int, char**) { + types::for_each(types::forward_iterator_list{}, types::apply_type_identity{[](auto v) { + using Iter = typename decltype(v)::type; + types::for_each( + types::forward_iterator_list{}, + TestIteratorWithPolicies::template apply>{}); + }}); + + return 0; +} From 454163354b0b2755746f9b3c32059adff4d34bd3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 30 May 2023 15:08:08 -0700 Subject: [PATCH 157/704] [RISCV] Allow FWMUL formation for an FP extend used twice by the same multiply. 
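The combine in performVFMUL_VLCombine previously gave up unless each
extended operand had a single use. When a multiply squares one extended
value, that single FP_EXTEND feeds both operands of the multiply and so
has two uses, which blocked the fold. The check now also accepts the case
where both operands are the same extend node with exactly two uses.

A minimal IR sketch of the newly accepted pattern (the function name is
illustrative; it mirrors the added vfwmul_squared_v2f16_v2f32 test, which
now selects a single vfwmul.vv):

  define <2 x float> @square_widen(ptr %x) {
    %a = load <2 x half>, ptr %x
    %b = fpext <2 x half> %a to <2 x float>
    %c = fmul <2 x float> %b, %b   ; both operands use the same fpext
    ret <2 x float> %c
  }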
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 +- .../CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll | 41 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 9d0267912c9fb..229345159280d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -11372,7 +11372,8 @@ static SDValue performVFMUL_VLCombine(SDNode *N, SelectionDAG &DAG) { // TODO: Refactor to handle more complex cases similar to // combineBinOp_VLToVWBinOp_VL. - if (!Op0.hasOneUse() || !Op1.hasOneUse()) + if ((!Op0.hasOneUse() || !Op1.hasOneUse()) && + (Op0 != Op1 || !Op0->hasNUsesOfValue(2, 0))) return SDValue(); // Check the mask and VL are the same. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll index 83f6571bd3259..c45349f975b50 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll @@ -391,3 +391,44 @@ define <32 x double> @vfwmul_vf_v32f32(ptr %x, float %y) { %f = fmul <32 x double> %d, %e ret <32 x double> %f } + +define <2 x float> @vfwmul_squared_v2f16_v2f32(ptr %x) { +; CHECK-LABEL: vfwmul_squared_v2f16_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vfwmul.vv v8, v9, v9 +; CHECK-NEXT: ret + %a = load <2 x half>, ptr %x + %b = fpext <2 x half> %a to <2 x float> + %c = fmul <2 x float> %b, %b + ret <2 x float> %c +} + +define <2 x double> @vfwmul_squared_v2f32_v2f64(ptr %x) { +; CHECK-LABEL: vfwmul_squared_v2f32_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vfwmul.vv v8, v9, v9 +; CHECK-NEXT: ret + %a = load <2 x float>, ptr %x + %b = fpext <2 x float> %a to <2 x double> + %c = fmul <2 x double> %b, %b + ret <2 x double> %c +} + +define <2 x double> @vfwmul_squared_v2f16_v2f64(ptr %x) { +; CHECK-LABEL: vfwmul_squared_v2f16_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwmul.vv v8, v9, v9 +; CHECK-NEXT: ret + %a = load <2 x half>, ptr %x + %b = fpext <2 x half> %a to <2 x double> + %c = fmul <2 x double> %b, %b + ret <2 x double> %c +} From 57154a63a07f732552968141136279350bcdf99d Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Wed, 24 May 2023 19:42:15 -0700 Subject: [PATCH 158/704] [lldb] Introduce FileSpec::GetComponents This patch introduces FileSpec::GetComponents, a method that splits a FileSpec's path into its individual components. For example, given /foo/bar/baz, you'll get back a vector of strings {"foo", "bar", baz"}. The motivation here is to reduce the use of `FileSpec::RemoveLastPathComponent`. Mutating a FileSpec is expensive, so providing a way of doing this without mutation is useful. 
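A sketch of the intended usage, based on the example in the new header
comment (the helper function and its name are illustrative, not part of
this patch):

  #include "lldb/Utility/FileSpec.h"
  #include "llvm/ADT/StringRef.h"
  #include <vector>

  using namespace lldb_private;

  // Splits a path into its components without mutating the FileSpec.
  // Note: the returned StringRefs stay valid only while `spec` is alive.
  std::vector<llvm::StringRef> SplitPath(const FileSpec &spec) {
    // For "/foo/bar/baz" this returns {"foo", "bar", "baz"}.
    return spec.GetComponents();
  }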
Differential Revision: https://reviews.llvm.org/D151399 --- lldb/include/lldb/Utility/FileSpec.h | 12 ++++++++ .../Platform/MacOSX/PlatformDarwin.cpp | 10 ++----- lldb/source/Utility/FileSpec.cpp | 20 +++++++++++++ lldb/unittests/Utility/FileSpecTest.cpp | 30 +++++++++++++++++++ 4 files changed, 65 insertions(+), 7 deletions(-) diff --git a/lldb/include/lldb/Utility/FileSpec.h b/lldb/include/lldb/Utility/FileSpec.h index 919b5e8564583..6eb5b805d9d9f 100644 --- a/lldb/include/lldb/Utility/FileSpec.h +++ b/lldb/include/lldb/Utility/FileSpec.h @@ -408,6 +408,18 @@ class FileSpec { /// A boolean value indicating whether the path was updated. bool RemoveLastPathComponent(); + /// Gets the components of the FileSpec's path. + /// For example, given the path: + /// /System/Library/PrivateFrameworks/UIFoundation.framework/UIFoundation + /// + /// This function returns: + /// {"System", "Library", "PrivateFrameworks", "UIFoundation.framework", + /// "UIFoundation"} + /// \return + /// A std::vector of llvm::StringRefs for each path component. + /// The lifetime of the StringRefs is tied to the lifetime of the FileSpec. + std::vector GetComponents() const; + protected: // Convenience method for setting the file without changing the style. void SetFile(llvm::StringRef path); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index 76c6b535679a6..60327fbe3124f 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -1236,13 +1236,9 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( // "UIFoundation" and "UIFoundation.framework" -- most likely the latter // will be the one we find there. - FileSpec platform_pull_upart(platform_file); - std::vector path_parts; - path_parts.push_back(platform_pull_upart.GetFilename().AsCString()); - while (platform_pull_upart.RemoveLastPathComponent()) { - ConstString part = platform_pull_upart.GetFilename(); - path_parts.push_back(part.AsCString()); - } + std::vector path_parts = platform_file.GetComponents(); + // We want the components in reverse order. + std::reverse(path_parts.begin(), path_parts.end()); const size_t path_parts_size = path_parts.size(); size_t num_module_search_paths = module_search_paths_ptr->GetSize(); diff --git a/lldb/source/Utility/FileSpec.cpp b/lldb/source/Utility/FileSpec.cpp index 6688e45650505..707033c1f8d6b 100644 --- a/lldb/source/Utility/FileSpec.cpp +++ b/lldb/source/Utility/FileSpec.cpp @@ -463,6 +463,26 @@ bool FileSpec::RemoveLastPathComponent() { } return false; } + +std::vector FileSpec::GetComponents() const { + std::vector components; + + auto dir_begin = llvm::sys::path::begin(m_directory.GetStringRef(), m_style); + auto dir_end = llvm::sys::path::end(m_directory.GetStringRef()); + + for (auto iter = dir_begin; iter != dir_end; ++iter) { + if (*iter == "/" || *iter == ".") + continue; + + components.push_back(*iter); + } + + if (!m_filename.IsEmpty() && m_filename != "/" && m_filename != ".") + components.push_back(m_filename.GetStringRef()); + + return components; +} + /// Returns true if the filespec represents an implementation source /// file (files with a ".c", ".cpp", ".m", ".mm" (many more) /// extension). 
diff --git a/lldb/unittests/Utility/FileSpecTest.cpp b/lldb/unittests/Utility/FileSpecTest.cpp index ffd3d343dc188..2a62f6b1e7612 100644 --- a/lldb/unittests/Utility/FileSpecTest.cpp +++ b/lldb/unittests/Utility/FileSpecTest.cpp @@ -504,3 +504,33 @@ TEST(FileSpecTest, TestIsSourceImplementationFile) { EXPECT_FALSE(win_noext.IsSourceImplementationFile()); EXPECT_FALSE(exe.IsSourceImplementationFile()); } + +TEST(FileSpecTest, TestGetComponents) { + std::pair> PosixTests[] = { + {"/", {}}, + {"/foo", {"foo"}}, + {"/foo/", {"foo"}}, + {"/foo/bar", {"foo", "bar"}}, + {"/llvm-project/lldb/unittests/Utility/FileSpecTest.cpp", + {"llvm-project", "lldb", "unittests", "Utility", "FileSpecTest.cpp"}}, + }; + + for (const auto &pair : PosixTests) { + FileSpec file_spec = PosixSpec(pair.first); + EXPECT_EQ(file_spec.GetComponents(), pair.second); + } + + std::pair> WindowsTests[] = { + {"C:\\", {"C:"}}, + {"C:\\Windows\\", {"C:", "Windows"}}, + {"C:\\Windows\\System32", {"C:", "Windows", "System32"}}, + {"C:\\llvm-project\\lldb\\unittests\\Utility\\FileSpecTest.cpp", + {"C:", "llvm-project", "lldb", "unittests", "Utility", + "FileSpecTest.cpp"}}, + }; + + for (const auto &pair : WindowsTests) { + FileSpec file_spec = WindowsSpec(pair.first); + EXPECT_EQ(file_spec.GetComponents(), pair.second); + } +} From 510f4168cf919d0fff94138a43876cc3bb29ccae Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 30 May 2023 15:16:00 -0700 Subject: [PATCH 159/704] [RISCV] Add some tail undisturbed vwmacc tests. NFC These are additional tests for D151596. --- llvm/test/CodeGen/RISCV/rvv/vwmacc-vp.ll | 28 +++++++++++++ llvm/test/CodeGen/RISCV/rvv/vwmaccu-vp.ll | 28 +++++++++++++ llvm/test/CodeGen/RISCV/rvv/vwmaccus-vp.ll | 47 ++++++++++++++++++++++ 3 files changed, 103 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/vwmacc-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vwmaccu-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vwmaccus-vp.ll diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwmacc-vp.ll new file mode 100644 index 0000000000000..f5cf4acd592c0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vwmacc-vp.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -target-abi=ilp32 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v -target-abi=lp64 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +declare @llvm.vp.sext.nxv1i32.nxv1i16(, , i32) +declare @llvm.vp.mul.nxv1i32(, , , i32) +declare @llvm.vp.add.nxv1i32(, , , i32) +declare @llvm.vp.merge.nxv1i32(, , , i32) + +define @vwmacc_vv_nxv1i32_unmasked_tu( %a, +; CHECK-LABEL: vwmacc_vv_nxv1i32_unmasked_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vwmacc.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %b, %c, i32 zeroext %evl) { + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %aext = call @llvm.vp.sext.nxv1i32.nxv1i16( %a, %allones, i32 %evl) + %bext = call @llvm.vp.sext.nxv1i32.nxv1i16( %b, %allones, i32 %evl) + %abmul = call @llvm.vp.mul.nxv1i32( %aext, %bext, %allones, i32 %evl) + %cadd = call @llvm.vp.add.nxv1i32( %abmul, %c, %allones, i32 %evl) + %ret = call @llvm.vp.merge.nxv1i32( %allones, %cadd, %c, i32 %evl) + ret %ret +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwmaccu-vp.ll new file mode 100644 index 
0000000000000..74dcb92b7cd65 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccu-vp.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -target-abi=ilp32 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v -target-abi=lp64 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +declare @llvm.vp.zext.nxv1i32.nxv1i16(, , i32) +declare @llvm.vp.mul.nxv1i32(, , , i32) +declare @llvm.vp.add.nxv1i32(, , , i32) +declare @llvm.vp.merge.nxv1i32(, , , i32) + +define @vwmacc_vv_nxv1i32_unmasked_tu( %a, +; CHECK-LABEL: vwmacc_vv_nxv1i32_unmasked_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vwmaccu.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %b, %c, i32 zeroext %evl) { + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %aext = call @llvm.vp.zext.nxv1i32.nxv1i16( %a, %allones, i32 %evl) + %bext = call @llvm.vp.zext.nxv1i32.nxv1i16( %b, %allones, i32 %evl) + %abmul = call @llvm.vp.mul.nxv1i32( %aext, %bext, %allones, i32 %evl) + %cadd = call @llvm.vp.add.nxv1i32( %abmul, %c, %allones, i32 %evl) + %ret = call @llvm.vp.merge.nxv1i32( %allones, %cadd, %c, i32 %evl) + ret %ret +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vwmaccus-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwmaccus-vp.ll new file mode 100644 index 0000000000000..72ef25ee9c318 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vwmaccus-vp.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -target-abi=ilp32 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v -target-abi=lp64 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +declare @llvm.vp.sext.nxv1i32.nxv1i16(, , i32) +declare @llvm.vp.zext.nxv1i32.nxv1i16(, , i32) +declare @llvm.vp.mul.nxv1i32(, , , i32) +declare @llvm.vp.add.nxv1i32(, , , i32) +declare @llvm.vp.merge.nxv1i32(, , , i32) + +define @vwmacc_vv_nxv1i32_unmasked_tu( %a, +; CHECK-LABEL: vwmacc_vv_nxv1i32_unmasked_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vwmaccsu.vv v10, v8, v9 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %b, %c, i32 zeroext %evl) { + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %aext = call @llvm.vp.sext.nxv1i32.nxv1i16( %a, %allones, i32 %evl) + %bext = call @llvm.vp.zext.nxv1i32.nxv1i16( %b, %allones, i32 %evl) + %abmul = call @llvm.vp.mul.nxv1i32( %aext, %bext, %allones, i32 %evl) + %cadd = call @llvm.vp.add.nxv1i32( %abmul, %c, %allones, i32 %evl) + %ret = call @llvm.vp.merge.nxv1i32( %allones, %cadd, %c, i32 %evl) + ret %ret +} + +define @vwmacc_vv_nxv1i32_commute_unmasked_tu( %a, +; CHECK-LABEL: vwmacc_vv_nxv1i32_commute_unmasked_tu: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma +; CHECK-NEXT: vwmaccsu.vv v10, v9, v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %b, %c, i32 zeroext %evl) { + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %aext = call @llvm.vp.zext.nxv1i32.nxv1i16( %a, %allones, i32 %evl) + %bext = call @llvm.vp.sext.nxv1i32.nxv1i16( %b, %allones, i32 %evl) + %abmul = call @llvm.vp.mul.nxv1i32( %aext, %bext, %allones, i32 %evl) + %cadd = call @llvm.vp.add.nxv1i32( %abmul, %c, %allones, i32 %evl) + %ret = call @llvm.vp.merge.nxv1i32( 
%allones, %cadd, %c, i32 %evl) + ret %ret +} From 76647fce136a362a30ee2434cce765dee9924d74 Mon Sep 17 00:00:00 2001 From: wren romano <2998727+wrengr@users.noreply.github.com> Date: Tue, 30 May 2023 13:16:29 -0700 Subject: [PATCH 160/704] [mlir][sparse] Combining `dimOrdering`+`higherOrdering` fields into `dimToLvl` This is a major step along the way towards the new STEA design. While a great deal of this patch is simple renaming, there are several significant changes as well. I've done my best to ensure that this patch retains the previous behavior and error-conditions, even though those are at odds with the eventual intended semantics of the `dimToLvl` mapping. Since the majority of the compiler does not yet support non-permutations, I've also added explicit assertions in places that previously had implicitly assumed it was dealing with permutations. Reviewed By: aartbik Differential Revision: https://reviews.llvm.org/D151505 --- mlir/include/mlir-c/Dialect/SparseTensor.h | 14 +- .../SparseTensor/IR/SparseTensorAttrDefs.td | 121 ++++++------ .../SparseTensor/IR/SparseTensorType.h | 73 ++++++-- .../Bindings/Python/DialectSparseTensor.cpp | 30 +-- mlir/lib/CAPI/Dialect/SparseTensor.cpp | 21 +-- .../SparseTensor/IR/SparseTensorDialect.cpp | 175 ++++++++++-------- .../SparseTensor/Transforms/LoopEmitter.cpp | 2 +- .../Transforms/SparseTensorCodegen.cpp | 25 ++- .../Transforms/SparseTensorConversion.cpp | 113 ++++++----- .../Transforms/SparseTensorRewriting.cpp | 67 +++---- .../Transforms/Sparsification.cpp | 13 +- mlir/test/CAPI/sparse_tensor.c | 15 +- mlir/test/Dialect/SparseTensor/codegen.mlir | 8 +- .../SparseTensor/codegen_sparse_dealloc.mlir | 2 +- .../test/Dialect/SparseTensor/conversion.mlir | 4 +- .../SparseTensor/convert_dense2sparse.mlir | 4 +- .../SparseTensor/convert_sparse2dense.mlir | 2 +- .../SparseTensor/convert_sparse2sparse.mlir | 2 +- .../SparseTensor/invalid_encoding.mlir | 24 +-- ..._shot_bufferize_tensor_copy_insertion.mlir | 2 +- .../SparseTensor/rewriting_for_codegen.mlir | 6 +- .../SparseTensor/roundtrip_encoding.mlir | 20 +- .../Dialect/SparseTensor/sparse_concat.mlir | 4 +- .../SparseTensor/sparse_concat_codegen.mlir | 4 +- .../Dialect/SparseTensor/sparse_expand.mlir | 4 +- .../SparseTensor/sparse_lower_col.mlir | 10 +- .../SparseTensor/sparse_matmul_codegen.mlir | 2 +- .../test/Dialect/SparseTensor/sparse_out.mlir | 4 +- .../Dialect/SparseTensor/sparse_perm.mlir | 2 +- .../SparseTensor/sparse_perm_lower.mlir | 2 +- .../SparseTensor/sparse_transpose.mlir | 14 +- .../SparseTensor/sparse_vector_concat.mlir | 4 +- .../SparseTensor/CPU/concatenate_dim_0.mlir | 8 +- .../CPU/concatenate_dim_0_permute.mlir | 8 +- .../SparseTensor/CPU/concatenate_dim_1.mlir | 8 +- .../CPU/concatenate_dim_1_permute.mlir | 8 +- .../SparseTensor/CPU/dense_output.mlir | 4 +- .../CPU/sparse_codegen_foreach.mlir | 6 +- .../CPU/sparse_conv_1d_nwc_wcf.mlir | 2 +- .../SparseTensor/CPU/sparse_conv_2d.mlir | 2 +- .../SparseTensor/CPU/sparse_conversion.mlir | 6 +- .../CPU/sparse_conversion_dyn.mlir | 2 +- .../CPU/sparse_conversion_element.mlir | 2 +- .../CPU/sparse_conversion_ptr.mlir | 4 +- .../CPU/sparse_conversion_sparse2dense.mlir | 12 +- .../CPU/sparse_conversion_sparse2sparse.mlir | 2 +- .../SparseTensor/CPU/sparse_expand.mlir | 2 +- .../SparseTensor/CPU/sparse_flatten.mlir | 4 +- .../SparseTensor/CPU/sparse_matmul.mlir | 4 +- .../SparseTensor/CPU/sparse_out_simple.mlir | 2 +- .../CPU/sparse_reduce_custom.mlir | 2 +- .../CPU/sparse_reduce_custom_prod.mlir | 2 +- 
.../SparseTensor/CPU/sparse_select.mlir | 2 +- .../SparseTensor/CPU/sparse_sorted_coo.mlir | 4 +- .../SparseTensor/CPU/sparse_storage.mlir | 6 +- .../SparseTensor/CPU/sparse_transpose.mlir | 2 +- .../Dialect/SparseTensor/python/test_SDDMM.py | 2 +- .../Dialect/SparseTensor/python/test_SpMM.py | 2 +- .../SparseTensor/python/test_output.py | 2 +- .../SparseTensor/python/test_stress.py | 2 +- .../SparseTensor/taco/tools/mlir_pytaco.py | 1 - .../python/dialects/sparse_tensor/dialect.py | 18 +- 62 files changed, 484 insertions(+), 440 deletions(-) diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h index 1ff6dc1b8dd54..0ad1a315e4c14 100644 --- a/mlir/include/mlir-c/Dialect/SparseTensor.h +++ b/mlir/include/mlir-c/Dialect/SparseTensor.h @@ -52,9 +52,8 @@ mlirAttributeIsASparseTensorEncodingAttr(MlirAttribute attr); /// Creates a `sparse_tensor.encoding` attribute with the given parameters. MLIR_CAPI_EXPORTED MlirAttribute mlirSparseTensorEncodingAttrGet( MlirContext ctx, intptr_t lvlRank, - enum MlirSparseTensorDimLevelType const *lvlTypes, - MlirAffineMap dimOrdering, MlirAffineMap higherOrdering, int posWidth, - int crdWidth); + enum MlirSparseTensorDimLevelType const *lvlTypes, MlirAffineMap dimToLvl, + int posWidth, int crdWidth); /// Returns the level-rank of the `sparse_tensor.encoding` attribute. MLIR_CAPI_EXPORTED intptr_t @@ -64,13 +63,10 @@ mlirSparseTensorEncodingGetLvlRank(MlirAttribute attr); MLIR_CAPI_EXPORTED enum MlirSparseTensorDimLevelType mlirSparseTensorEncodingAttrGetLvlType(MlirAttribute attr, intptr_t lvl); -/// Returns the dimension-ordering of the `sparse_tensor.encoding` attribute. +/// Returns the dimension-to-level mapping of the `sparse_tensor.encoding` +/// attribute. MLIR_CAPI_EXPORTED MlirAffineMap -mlirSparseTensorEncodingAttrGetDimOrdering(MlirAttribute attr); - -/// Returns the higher-ordering of the `sparse_tensor.encoding` attribute. -MLIR_CAPI_EXPORTED MlirAffineMap -mlirSparseTensorEncodingAttrGetHigherOrdering(MlirAttribute attr); +mlirSparseTensorEncodingAttrGetDimToLvl(MlirAttribute attr); /// Returns the position bitwidth of the `sparse_tensor.encoding` attribute. MLIR_CAPI_EXPORTED int diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index adfdc48014902..e49d7be36620c 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -125,6 +125,22 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", passes that run before this sparse compiler pass need to be aware of the semantics of tensor types with such an encoding. + Each sparse tensor comes equipped with two different sets of axes for + describing the tensor's multi-dimensional structure. We use the term + "dimension" to refer to the axes of the semantic tensor itself; whereas, + we use the term "level" to refer to the axes of the storage scheme, + which is the operational representation of that tensor. Therefore, + the fields of the encoding attribute (further explained below) satisfy + the following correspondences: + + - Dimensions: + - the shape of the tensor type + - the `dimSlices` field + - the arguments of the `dimToLvl` field + - Levels: + - the results of the `dimToLvl` field + - the `lvlTypes` field + The attribute consists of the following fields. 
- Level-type for each level of a tensor type: @@ -144,30 +160,13 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", properties, and split up how the level-format and properties are specified rather than using this suffix mechanism. - - An optional permutation which maps (higher-ordering)-coordinates - to level-coordinates; defaulting to the identity permutation. - For example, given a 2-d tensor with the default higher-ordering, - `(i, j) -> (i, j)` specifies row-wise storage and `(i, j) -> - (j, i)` specifies column-wise storage. - - TODO: this field is called "dimOrdering" for historical reasons, - even though it actually operates on level-coordinates rather than - dimension-coordinates. - (This will be corrected in an upcoming change that completely - overhauls the syntax of this attribute.) - - - An optional higher-order mapping from dimension-coordinates to - a higher-order coordinate space; defaulting to the identity map. - This is applied before the `dimOrdering`, thus we have the composite: - dimCoords --higherOrdering--> hoCoords --dimOrdering--> lvlCoords. - The higher-order mapping is used to define block-sparse storage, - jagged-diagonal (JDS/ELL/ITPACK) storage, etc. - - For example, given a 2-d tensor, the mapping + - An optional affine map from dimension-coordinates to level-coordinates; + defaulting to the identity map. For example, given a 2-d tensor: + `(i, j) -> (i, j)` specifies row-wise storage, `(i, j) -> (j, i)` + specifies column-wise storage, and `(i, j) -> (i floordiv 2, j floordiv 3, i mod 2, j mod 3)` - imposes an higher-order partitioning into 2x3 blocks along the - matrix layout. For block-sparsity, blocks are typically stored - with compression while dense storage is used within each block + specifies 2x3 block-sparsity. For block-sparsity, blocks are typically + stored with compression while dense storage is used within each block (although hybrid schemes are possible as well). TODO: the following example is out-of-date and will be implemented @@ -175,7 +174,7 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", (This will be corrected in an upcoming change that completely overhauls the syntax of this attribute.) - The higher-order mapping also provides a notion of "counting a + The dimToLvl mapping also provides a notion of "counting a dimension", where every stored element with the same coordinate is mapped to a new slice. For instance, ELL storage of a 2-d tensor can be defined with the mapping `(i, j) -> (#i, i, j)` @@ -221,7 +220,7 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", // Doubly compressed sparse column storage with specific bitwidths. #DCSC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i, j) -> (j, i)>, + dimToLvl = affine_map<(i, j) -> (j, i)>, posWidth = 32, crdWidth = 8 }> @@ -230,16 +229,14 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", // Block sparse row storage (2x3 blocks). #BCSR = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "dense", "dense" ], - dimOrdering = affine_map<(ii, jj, i, j) -> (ii, jj, i, j)>, - higherOrdering = affine_map<(i, j) -> (i floordiv 2, j floordiv 3, i mod 2, j mod 3)> + dimToLvl = affine_map<(i, j) -> (i floordiv 2, j floordiv 3, i mod 2, j mod 3)> }> ... tensor<20x30xf32, #BCSR> ... // ELL storage (4 jagged diagonals, i.e., at most 4 nonzeros per row). 
#ELL = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense", "compressed" ], - dimOrdering = affine_map<(ii, i, j) -> (ii, i, j)>, - higherOrdering = affine_map<(i, j)[c] -> (c * 4 * i, i, j)> + dimToLvl = affine_map<(i, j)[c] -> (c * 4 * i, i, j)> }> ... tensor ... @@ -262,15 +259,16 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", "::mlir::sparse_tensor::DimLevelType", "level-types" >: $lvlTypes, - // A permutation from (higher-ordering)-coordinates to level-coordinates. - "AffineMap":$dimOrdering, - // A mapping from dimension-coordinates to (higher-ordering)-coordinates. - "AffineMap":$higherOrdering, + // A mapping from dimension-coordinates to level-coordinates. + "AffineMap":$dimToLvl, // The required bitwidth for position storage. "unsigned":$posWidth, // The required bitwidth for coordinate storage. "unsigned":$crdWidth, // A slice attribute for each dimension of the tensor type. + // FIXME: The name used here is `dimSlices`, however the + // parser/printer uses the name `slice` instead. Therefore + // the parser/printer need to be updated to match. ArrayRefParameter< "::mlir::sparse_tensor::SparseTensorDimSliceAttr", "per dimension slice metadata" @@ -279,16 +277,11 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", let builders = [ AttrBuilder<(ins "ArrayRef<::mlir::sparse_tensor::DimLevelType>":$lvlTypes, - "AffineMap":$dimOrdering, - "AffineMap":$higherOrdering, + "AffineMap":$dimToLvl, "unsigned":$posWidth, "unsigned":$crdWidth), [{ - return $_get($_ctxt, lvlTypes, - dimOrdering, - higherOrdering, - posWidth, - crdWidth, - ArrayRef<::mlir::sparse_tensor::SparseTensorDimSliceAttr>{}); + return $_get($_ctxt, lvlTypes, dimToLvl, posWidth, crdWidth, + ArrayRef<::mlir::sparse_tensor::SparseTensorDimSliceAttr>{}); }]> ]; @@ -297,23 +290,40 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", // Factory methods. // - /// Constructs a new encoding with the dimOrdering and higherOrdering - /// reset to the default/identity. - SparseTensorEncodingAttr withoutOrdering() const; + /// Constructs a new encoding with the given dimToLvl mapping, + /// and all other fields inherited from `this`. + SparseTensorEncodingAttr withDimToLvl(AffineMap dimToLvl) const; + SparseTensorEncodingAttr withDimToLvl(SparseTensorEncodingAttr enc) const; + + /// Constructs a new encoding with dimToLvl reset to the default/identity, + /// and all other fields inherited from `this`. + SparseTensorEncodingAttr withoutDimToLvl() const; + + /// Constructs a new encoding with the given pointer and index + /// bitwidths, and all other fields inherited from `this`. + SparseTensorEncodingAttr withBitWidths(unsigned posWidth, unsigned crdWidth) const; - /// Constructs a new encoding with the pointer and index bitwidth - /// reset to the default. + /// Constructs a new encoding with the pointer and index bitwidths + /// reset to the default, and all other fields inherited from `this`. SparseTensorEncodingAttr withoutBitWidths() const; // - // lvlTypes methods. + // Rank methods. // + /// Returns the expected number of tensor dimensions. Asserts that + /// the encoding is non-null (since no fixed result is valid for every + /// dense-tensor). + ::mlir::sparse_tensor::Dimension getDimRank() const; + /// Returns the number of storage levels. Asserts that the encoding - /// is non-null (since there is no fixed result that's valid for - /// every dense-tensor). + /// is non-null (since no fixed result is valid for every dense-tensor). 
::mlir::sparse_tensor::Level getLvlRank() const; + // + // lvlTypes methods. + // + /// Safely looks up the level-type for the requested level. (Returns /// `DimLevelType::Dense` for the null encoding, since dense-tensors /// are always all-dense.) @@ -335,13 +345,18 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", bool isAllOrdered() const; // - // dimOrdering/higherOrdering methods. + // dimToLvl methods. // - /// Returns true if the encoding has an identity dimension ordering. + /// Returns true if the dimToLvl mapping is the identity. + /// Also returns true for the null encoding (since dense-tensors + /// always have the identity mapping). + bool isIdentity() const; + + /// Returns true if the dimToLvl mapping is a permutation. /// Also returns true for the null encoding (since dense-tensors - /// always have the identity ordering). - bool hasIdDimOrdering() const; + /// always have the identity mapping). + bool isPermutation() const; // // posWidth/crdWidth methods. diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h index 4c4f1f25edfd5..6cae09db36cc1 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h @@ -45,12 +45,12 @@ namespace sparse_tensor { /// class SparseTensorType { public: - // We memoize `lvlRank` and `dim2lvl` to avoid repeating the + // We memoize `lvlRank` and `dimToLvl` to avoid repeating the // conditionals throughout the rest of the class. SparseTensorType(RankedTensorType rtp) : rtp(rtp), enc(getSparseTensorEncoding(rtp)), lvlRank(enc ? enc.getLvlRank() : getDimRank()), - dim2lvl(enc.hasIdDimOrdering() ? AffineMap() : enc.getDimOrdering()) { + dimToLvl(enc.isIdentity() ? AffineMap() : enc.getDimToLvl()) { assert(rtp && "got null RankedTensorType"); assert((!isIdentity() || getDimRank() == lvlRank) && "Rank mismatch"); } @@ -65,6 +65,10 @@ class SparseTensorType { // So we must explicitly define the copy-ctor to silence -Wdeprecated-copy. SparseTensorType(const SparseTensorType &) = default; + // + // Factory methods. + // + /// Constructs a new `SparseTensorType` with the same dimension-shape /// and element type, but with the encoding replaced by the given encoding. SparseTensorType withEncoding(SparseTensorEncodingAttr newEnc) const { @@ -73,11 +77,44 @@ class SparseTensorType { /// Constructs a new `SparseTensorType` with the same dimension-shape /// and element type, but with the encoding replaced by - /// `getEncoding().withoutOrdering()`. - SparseTensorType withoutOrdering() const { - return withEncoding(enc.withoutOrdering()); + /// `getEncoding().withDimToLvl(dimToLvl)`. + SparseTensorType withDimToLvl(AffineMap dimToLvl) const { + return withEncoding(enc.withDimToLvl(dimToLvl)); + } + + SparseTensorType withDimToLvl(SparseTensorEncodingAttr dimToLvlEnc) const { + return withEncoding(enc.withDimToLvl(dimToLvlEnc)); + } + + SparseTensorType withDimToLvl(const SparseTensorType &dimToLvlSTT) const { + return withDimToLvl(dimToLvlSTT.getEncoding()); + } + + /// Constructs a new `SparseTensorType` with the same dimension-shape + /// and element type, but with the encoding replaced by + /// `getEncoding().withoutDimToLvl()`. 
+ SparseTensorType withoutDimToLvl() const { + return withEncoding(enc.withoutDimToLvl()); + } + + /// Constructs a new `SparseTensorType` with the same dimension-shape + /// and element type, but with the encoding replaced by + /// `getEncoding().withBitWidths(posWidth, crdWidth)`. + SparseTensorType withBitWidths(unsigned posWidth, unsigned crdWidth) const { + return withEncoding(enc.withBitWidths(posWidth, crdWidth)); + } + + /// Constructs a new `SparseTensorType` with the same dimension-shape + /// and element type, but with the encoding replaced by + /// `getEncoding().withoutBitWidths()`. + SparseTensorType withoutBitWidths() const { + return withEncoding(enc.withoutBitWidths()); } + // + // Other methods. + // + /// Allow implicit conversion to `RankedTensorType`, `ShapedType`, /// and `Type`. These are implicit to help alleviate the impedance /// mismatch for code that has not been converted to use `SparseTensorType` @@ -144,32 +181,36 @@ class SparseTensorType { /// Returns true if the dimToLvl mapping is the identity. /// (This is always true for dense-tensors.) - bool isIdentity() const { return !dim2lvl; } + bool isIdentity() const { return !dimToLvl; } + + /// Returns true if the dimToLvl mapping is a permutation. + /// (This is always true for dense-tensors.) + bool isPermutation() const { return enc.isPermutation(); } /// Returns the dimToLvl mapping (or the null-map for the identity). /// If you intend to compare the results of this method for equality, - /// see `hasSameDimToLvlMap` instead. - AffineMap getDimToLvlMap() const { return dim2lvl; } + /// see `hasSameDimToLvl` instead. + AffineMap getDimToLvl() const { return dimToLvl; } /// Returns the dimToLvl mapping, where the identity map is expanded out /// into a full `AffineMap`. This method is provided as a convenience, - /// but for most purposes other methods (`isIdentity`, `getDimToLvlMap`, + /// but for most purposes other methods (`isIdentity`, `getDimToLvl`, /// etc) will be more helpful. - AffineMap getExpandedDimToLvlMap() const { - return dim2lvl - ? dim2lvl + AffineMap getExpandedDimToLvl() const { + return dimToLvl + ? dimToLvl : AffineMap::getMultiDimIdentityMap(getDimRank(), getContext()); } /// Returns true iff the two types have the same mapping. This method /// takes care to handle identity maps properly, so it should be preferred - /// over using `getDimToLvlMap` followed by `AffineMap::operator==`. - bool hasSameDimToLvlMap(const SparseTensorType &other) const { + /// over using `getDimToLvl` followed by `AffineMap::operator==`. + bool hasSameDimToLvl(const SparseTensorType &other) const { // If the maps are the identity, then we need to check the rank // to be sure they're the same size identity. (And since identity // means dimRank==lvlRank, we use lvlRank as a minor optimization.) return isIdentity() ? (other.isIdentity() && lvlRank == other.lvlRank) - : (dim2lvl == other.dim2lvl); + : (dimToLvl == other.dimToLvl); } /// Returns the dimension-rank. @@ -255,7 +296,7 @@ class SparseTensorType { const SparseTensorEncodingAttr enc; // Memoized to avoid frequent redundant conditionals. const Level lvlRank; - const AffineMap dim2lvl; + const AffineMap dimToLvl; }; /// Convenience method to abbreviate wrapping `getRankedTensorType`. 
diff --git a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp index 0f0e676041b2f..2e8d535455a34 100644 --- a/mlir/lib/Bindings/Python/DialectSparseTensor.cpp +++ b/mlir/lib/Bindings/Python/DialectSparseTensor.cpp @@ -40,18 +40,16 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) { .def_classmethod( "get", [](py::object cls, std::vector lvlTypes, - std::optional dimOrdering, - std::optional higherOrdering, int posWidth, - int crdWidth, MlirContext context) { + std::optional dimToLvl, int posWidth, int crdWidth, + MlirContext context) { return cls(mlirSparseTensorEncodingAttrGet( context, lvlTypes.size(), lvlTypes.data(), - dimOrdering ? *dimOrdering : MlirAffineMap{nullptr}, - higherOrdering ? *higherOrdering : MlirAffineMap{nullptr}, - posWidth, crdWidth)); + dimToLvl ? *dimToLvl : MlirAffineMap{nullptr}, posWidth, + crdWidth)); }, - py::arg("cls"), py::arg("lvl_types"), py::arg("dim_ordering"), - py::arg("higher_ordering"), py::arg("pos_width"), - py::arg("crd_width"), py::arg("context") = py::none(), + py::arg("cls"), py::arg("lvl_types"), py::arg("dim_to_lvl"), + py::arg("pos_width"), py::arg("crd_width"), + py::arg("context") = py::none(), "Gets a sparse_tensor.encoding from parameters.") .def_property_readonly( "lvl_types", @@ -64,19 +62,9 @@ static void populateDialectSparseTensorSubmodule(const py::module &m) { return ret; }) .def_property_readonly( - "dim_ordering", + "dim_to_lvl", [](MlirAttribute self) -> std::optional { - MlirAffineMap ret = - mlirSparseTensorEncodingAttrGetDimOrdering(self); - if (mlirAffineMapIsNull(ret)) - return {}; - return ret; - }) - .def_property_readonly( - "higher_ordering", - [](MlirAttribute self) -> std::optional { - MlirAffineMap ret = - mlirSparseTensorEncodingAttrGetHigherOrdering(self); + MlirAffineMap ret = mlirSparseTensorEncodingAttrGetDimToLvl(self); if (mlirAffineMapIsNull(ret)) return {}; return ret; diff --git a/mlir/lib/CAPI/Dialect/SparseTensor.cpp b/mlir/lib/CAPI/Dialect/SparseTensor.cpp index 8569acf436138..e18da1027e0f3 100644 --- a/mlir/lib/CAPI/Dialect/SparseTensor.cpp +++ b/mlir/lib/CAPI/Dialect/SparseTensor.cpp @@ -45,26 +45,21 @@ bool mlirAttributeIsASparseTensorEncodingAttr(MlirAttribute attr) { return isa(unwrap(attr)); } -MlirAttribute mlirSparseTensorEncodingAttrGet( - MlirContext ctx, intptr_t lvlRank, - MlirSparseTensorDimLevelType const *lvlTypes, MlirAffineMap dimOrdering, - MlirAffineMap higherOrdering, int posWidth, int crdWidth) { +MlirAttribute +mlirSparseTensorEncodingAttrGet(MlirContext ctx, intptr_t lvlRank, + MlirSparseTensorDimLevelType const *lvlTypes, + MlirAffineMap dimToLvl, int posWidth, + int crdWidth) { SmallVector cppLvlTypes; cppLvlTypes.reserve(lvlRank); for (intptr_t l = 0; l < lvlRank; ++l) cppLvlTypes.push_back(static_cast(lvlTypes[l])); return wrap(SparseTensorEncodingAttr::get( - unwrap(ctx), cppLvlTypes, unwrap(dimOrdering), unwrap(higherOrdering), - posWidth, crdWidth)); + unwrap(ctx), cppLvlTypes, unwrap(dimToLvl), posWidth, crdWidth)); } -MlirAffineMap mlirSparseTensorEncodingAttrGetDimOrdering(MlirAttribute attr) { - return wrap(cast(unwrap(attr)).getDimOrdering()); -} - -MlirAffineMap -mlirSparseTensorEncodingAttrGetHigherOrdering(MlirAttribute attr) { - return wrap(cast(unwrap(attr)).getHigherOrdering()); +MlirAffineMap mlirSparseTensorEncodingAttrGetDimToLvl(MlirAttribute attr) { + return wrap(cast(unwrap(attr)).getDimToLvl()); } intptr_t mlirSparseTensorEncodingGetLvlRank(MlirAttribute attr) { diff --git 
a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 3175e957698d0..ae4198f5dce69 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -263,15 +263,32 @@ Type SparseTensorEncodingAttr::getCrdType() const { return detail::getIntegerOrIndexType(getContext(), getCrdWidth()); } -SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutOrdering() const { - return SparseTensorEncodingAttr::get(getContext(), getLvlTypes(), AffineMap(), - AffineMap(), getPosWidth(), - getCrdWidth()); +SparseTensorEncodingAttr +SparseTensorEncodingAttr::withDimToLvl(AffineMap dimToLvl) const { + assert(getImpl() && "Uninitialized SparseTensorEncodingAttr"); + return SparseTensorEncodingAttr::get(getContext(), getLvlTypes(), dimToLvl, + getPosWidth(), getCrdWidth()); +} + +SparseTensorEncodingAttr +SparseTensorEncodingAttr::withDimToLvl(SparseTensorEncodingAttr enc) const { + return withDimToLvl(enc ? enc.getDimToLvl() : AffineMap()); +} + +SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutDimToLvl() const { + return withDimToLvl(AffineMap()); +} + +SparseTensorEncodingAttr +SparseTensorEncodingAttr::withBitWidths(unsigned posWidth, + unsigned crdWidth) const { + assert(getImpl() && "Uninitialized SparseTensorEncodingAttr"); + return SparseTensorEncodingAttr::get(getContext(), getLvlTypes(), + getDimToLvl(), posWidth, crdWidth); } SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutBitWidths() const { - return SparseTensorEncodingAttr::get( - getContext(), getLvlTypes(), getDimOrdering(), getHigherOrdering(), 0, 0); + return withBitWidths(0, 0); } bool SparseTensorEncodingAttr::isAllDense() const { @@ -282,8 +299,18 @@ bool SparseTensorEncodingAttr::isAllOrdered() const { return !getImpl() || llvm::all_of(getLvlTypes(), isOrderedDLT); } -bool SparseTensorEncodingAttr::hasIdDimOrdering() const { - return !getImpl() || !getDimOrdering() || getDimOrdering().isIdentity(); +bool SparseTensorEncodingAttr::isIdentity() const { + return !getImpl() || !getDimToLvl() || getDimToLvl().isIdentity(); +} + +bool SparseTensorEncodingAttr::isPermutation() const { + return !getImpl() || !getDimToLvl() || getDimToLvl().isPermutation(); +} + +Dimension SparseTensorEncodingAttr::getDimRank() const { + assert(getImpl() && "Uninitialized SparseTensorEncodingAttr"); + const auto dimToLvl = getDimToLvl(); + return dimToLvl ? dimToLvl.getNumDims() : getLvlRank(); } Level SparseTensorEncodingAttr::getLvlRank() const { @@ -382,15 +409,14 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) { // Process the data from the parsed dictionary value into struct-like data. SmallVector lvlTypes; SmallVector slices; - AffineMap dimOrd = {}; - AffineMap higherOrd = {}; + AffineMap dimToLvl = {}; unsigned posWidth = 0; unsigned crdWidth = 0; StringRef attrName; // Exactly 6 keys. 
- SmallVector keys = {"lvlTypes", "dimOrdering", "higherOrdering", - "posWidth", "crdWidth", "slice"}; + SmallVector keys = {"lvlTypes", "dimToLvl", "posWidth", + "crdWidth", "slice"}; while (succeeded(parser.parseOptionalKeyword(&attrName))) { if (!llvm::is_contained(keys, attrName)) { parser.emitError(parser.getNameLoc(), "unexpected key: ") << attrName; @@ -420,18 +446,12 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) { return {}; } } - } else if (attrName == "dimOrdering") { - Attribute attr; - RETURN_ON_FAIL(parser.parseAttribute(attr)) - auto affineAttr = llvm::dyn_cast(attr); - ERROR_IF(!affineAttr, "expected an affine map for dimension ordering") - dimOrd = affineAttr.getValue(); - } else if (attrName == "higherOrdering") { + } else if (attrName == "dimToLvl") { Attribute attr; RETURN_ON_FAIL(parser.parseAttribute(attr)) auto affineAttr = llvm::dyn_cast(attr); - ERROR_IF(!affineAttr, "expected an affine map for higher ordering") - higherOrd = affineAttr.getValue(); + ERROR_IF(!affineAttr, "expected an affine map for dimToLvl") + dimToLvl = affineAttr.getValue(); } else if (attrName == "posWidth") { Attribute attr; RETURN_ON_FAIL(parser.parseAttribute(attr)) @@ -474,8 +494,7 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) { // Construct struct-like storage for attribute. return parser.getChecked( - parser.getContext(), lvlTypes, dimOrd, higherOrd, posWidth, crdWidth, - slices); + parser.getContext(), lvlTypes, dimToLvl, posWidth, crdWidth, slices); } void SparseTensorEncodingAttr::print(AsmPrinter &printer) const { @@ -486,10 +505,8 @@ void SparseTensorEncodingAttr::print(AsmPrinter &printer) const { }); printer << " ]"; // Print remaining members only for non-default values. - if (!hasIdDimOrdering()) - printer << ", dimOrdering = affine_map<" << getDimOrdering() << ">"; - if (getHigherOrdering()) - printer << ", higherOrdering = affine_map<" << getHigherOrdering() << ">"; + if (!isIdentity()) + printer << ", dimToLvl = affine_map<" << getDimToLvl() << ">"; if (getPosWidth()) printer << ", posWidth = " << getPosWidth(); if (getCrdWidth()) @@ -510,9 +527,8 @@ void SparseTensorEncodingAttr::print(AsmPrinter &printer) const { LogicalResult SparseTensorEncodingAttr::verify( function_ref emitError, - ArrayRef lvlTypes, AffineMap dimOrdering, - AffineMap higherOrdering, unsigned posWidth, unsigned crdWidth, - ArrayRef dimSlices) { + ArrayRef lvlTypes, AffineMap dimToLvl, unsigned posWidth, + unsigned crdWidth, ArrayRef dimSlices) { if (!acceptBitWidth(posWidth)) return emitError() << "unexpected position bitwidth: " << posWidth; if (!acceptBitWidth(crdWidth)) @@ -525,25 +541,41 @@ LogicalResult SparseTensorEncodingAttr::verify( const Level lvlRank = lvlTypes.size(); if (lvlRank == 0) return emitError() << "expected a non-empty array for lvlTypes"; - if (dimOrdering) { - if (!dimOrdering.isPermutation()) - return emitError() - << "expected a permutation affine map for dimension ordering"; - if (dimOrdering.getNumResults() != lvlRank) + // We save `dimRank` here because we'll also need it to verify `dimSlices`. + const Dimension dimRank = dimToLvl ? 
dimToLvl.getNumDims() : lvlRank; + if (dimToLvl) { + if (dimToLvl.getNumResults() != lvlRank) return emitError() - << "level-rank mismatch between dimOrdering and lvlTypes"; + << "level-rank mismatch between dimToLvl and lvlTypes: " + << dimToLvl.getNumResults() << " != " << lvlRank; + // TODO: The following is attempting to match the old error-conditions + // from prior to merging dimOrdering and higherOrdering into dimToLvl. + // That is, we currently require `dimToLvl` to be either a permutation + // (as when higherOrdering is the identity) or expansive (as per the + // constraints on higherOrdering). However, those constraints do + // not match the intended semantics of `dimToLvl`. As we improve the + // compiler to actually handle non-permutations, we need to update these + // checks to match what is actually supported. In particular, this is + // where we'll have to check that when `lvlToDim` is provided then it + // is indeed an inverse of `dimToLvl`, and when it isn't provided then + // it can be automatically inferred. + if (dimRank == lvlRank && !dimToLvl.isPermutation()) + return emitError() << "expected a permutation affine map for dimToLvl"; + if (dimRank > lvlRank) + return emitError() << "unexpected dimToLvl mapping from " << dimRank + << " to " << lvlRank; } - if (higherOrdering) { - if (higherOrdering.getNumDims() >= higherOrdering.getNumResults()) - return emitError() << "unexpected higher ordering mapping from " - << higherOrdering.getNumDims() << " to " - << higherOrdering.getNumResults(); - if (higherOrdering.getNumResults() != lvlRank) + if (!dimSlices.empty()) { + if (dimSlices.size() != dimRank) return emitError() - << "level-rank mismatch between higherOrdering and lvlTypes"; - } - if (!dimSlices.empty() && dimSlices.size() != lvlRank) { - return emitError() << "level-rank mismatch between dimSlices and lvlTypes"; + << "dimension-rank mismatch between dimSlices and dimToLvl: " + << dimSlices.size() << " != " << dimRank; + // Compiler support for `dimSlices` currently requires that the two + // ranks agree. (However, it does allow `dimToLvl` to be a permutation.) + if (dimRank != lvlRank) + return emitError() + << "dimSlices expected dimension-rank to match level-rank: " + << dimRank << " != " << lvlRank; } return success(); } @@ -558,24 +590,18 @@ LogicalResult SparseTensorEncodingAttr::verifyEncoding( function_ref emitError) const { // Check structural integrity. In particular, this ensures that the // level-rank is coherent across all the fields. - RETURN_FAILURE_IF_FAILED(verify(emitError, getLvlTypes(), getDimOrdering(), - getHigherOrdering(), getPosWidth(), - getCrdWidth(), getDimSlices())) + RETURN_FAILURE_IF_FAILED(verify(emitError, getLvlTypes(), getDimToLvl(), + getPosWidth(), getCrdWidth(), getDimSlices())) // Check integrity with tensor type specifics. In particular, we // need only check that the dimension-rank of the tensor agrees with // the dimension-rank of the encoding. 
const Dimension dimRank = dimShape.size(); if (dimRank == 0) return emitError() << "expected non-scalar sparse tensor"; - if (const auto higherOrdering = getHigherOrdering()) { - if (higherOrdering.getNumDims() != dimRank) - return emitError() << "expected an affine map with " << dimRank - << " dimensions for higher ordering"; - // TODO: verification of higher ordering contents - } else if (dimRank != getLvlRank()) { - return emitError() << "expected an array of size " << dimRank - << " for lvlTypes"; - } + if (getDimRank() != dimRank) + return emitError() + << "dimension-rank mismatch between encoding and tensor shape: " + << getDimRank() << " != " << dimRank; return success(); } @@ -627,14 +653,14 @@ RankedTensorType sparse_tensor::getCOOFromTypeWithOrdering(RankedTensorType rtt, AffineMap lvlPerm, bool ordered) { const SparseTensorType src(rtt); - // The dim-rank of the source `RankedTensorType` is used as the lvl-rank - // of the result `RankedTensorType`. This follows from the fact that the - // result's encoding has the default higher-ordering (hence the result's - // lvl-rank equals its dim-rank). We don't need to assert that `lvlRank` - // agrees with the size of `lvlPerm` because that will be verified by - // `STEA::get`. - const Level lvlRank = src.getDimRank(); + // TODO: This assertion is to match the behavior from before we merged + // dimOrdering and higherOrdering into dimToLvl. However, there's no + // in-principle reason to require this. (wrengr has a commit in the + // wings to fix this.) + assert(src.isPermutation()); + const Level lvlRank = src.getLvlRank(); SmallVector lvlTypes; + lvlTypes.reserve(lvlRank); // An unordered and non-unique compressed level at beginning. // If this is also the last level, then it is unique. @@ -655,7 +681,7 @@ RankedTensorType sparse_tensor::getCOOFromTypeWithOrdering(RankedTensorType rtt, unsigned posWidth = src.getPosWidth(); unsigned crdWidth = src.getCrdWidth(); auto enc = SparseTensorEncodingAttr::get(src.getContext(), lvlTypes, lvlPerm, - AffineMap(), posWidth, crdWidth); + posWidth, crdWidth); return RankedTensorType::get(src.getDimShape(), src.getElementType(), enc); } @@ -671,10 +697,9 @@ RankedTensorType sparse_tensor::getCOOFromType(RankedTensorType src, Dimension mlir::sparse_tensor::toOrigDim(SparseTensorEncodingAttr enc, Level l) { if (enc) { - auto order = enc.getDimOrdering(); - if (order) { - assert(order.isPermutation()); - return order.getDimPosition(l); + if (const auto dimToLvl = enc.getDimToLvl()) { + assert(enc.isPermutation()); + return dimToLvl.getDimPosition(l); } } return l; @@ -685,11 +710,10 @@ Dimension mlir::sparse_tensor::toOrigDim(SparseTensorEncodingAttr enc, Level mlir::sparse_tensor::toStoredDim(SparseTensorEncodingAttr enc, Dimension d) { if (enc) { - auto order = enc.getDimOrdering(); - if (order) { - assert(order.isPermutation()); + if (const auto dimToLvl = enc.getDimToLvl()) { + assert(enc.isPermutation()); auto maybePos = - order.getResultPosition(getAffineDimExpr(d, enc.getContext())); + dimToLvl.getResultPosition(getAffineDimExpr(d, enc.getContext())); assert(maybePos.has_value()); return *maybePos; } @@ -728,8 +752,7 @@ getNormalizedEncodingForSpecifier(SparseTensorEncodingAttr enc) { return SparseTensorEncodingAttr::get( enc.getContext(), dlts, - AffineMap(), // dimOrdering (irrelavant to storage speicifer) - AffineMap(), // highLvlOrdering (irrelavant to storage specifer) + AffineMap(), // dimToLvl (irrelevant to storage specifier) // Always use `index` for memSize and lvlSize instead of 
reusing // `getPosWidth` and `getCrdWidth`. It allows us to reuse the same SSA // value for different bitwidth, it also avoids casting between index and diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp index a50e337def72d..182ae45d6cc1a 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp @@ -385,7 +385,7 @@ void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc, // FIXME: `toOrigDim` is deprecated. For now this relies on the // 1:1 mapping between levels and dimensions, since nowhere else - // in the code supports HigherOrdering yet either. + // in the code supports non-permutations yet either. Value lvlSz = mlir::linalg::createOrFoldDimOp(builder, loc, tensor, toOrigDim(enc, l)); // Find upper bound in current dimension. diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp index 20d0c5e7d4f1b..f84009c4b63bd 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp @@ -481,7 +481,7 @@ class SparseInsertGenerator nameOstream << sh << "_"; // Permutation information is also used in generating insertion. if (!stt.isIdentity()) - nameOstream << stt.getDimToLvlMap() << "_"; + nameOstream << stt.getDimToLvl() << "_"; nameOstream << stt.getElementType() << "_"; nameOstream << stt.getCrdWidth() << "_" << stt.getPosWidth(); return nameOstream.str().str(); @@ -1139,8 +1139,7 @@ class SparseExtractSliceConverter if (!srcEnc || !dstEnc || !dstEnc.isSlice()) return failure(); assert(srcEnc.getLvlTypes() == dstEnc.getLvlTypes()); - assert(srcEnc.getDimOrdering() == dstEnc.getDimOrdering()); - assert(srcEnc.getHigherOrdering() == dstEnc.getHigherOrdering()); + assert(srcEnc.getDimToLvl() == dstEnc.getDimToLvl()); assert(srcEnc.getPosWidth() == dstEnc.getPosWidth()); assert(srcEnc.getCrdWidth() == dstEnc.getCrdWidth()); @@ -1168,7 +1167,7 @@ class SparseExtractSliceConverter // FIXME: we need to distinguish level sizes and dimension size for slices // here. Maybe we should store slice level sizes in a different array // instead of reusing it. - assert(srcEnc.hasIdDimOrdering()); + assert(srcEnc.isIdentity()); desc.setSpecifierField(rewriter, loc, StorageSpecifierKind::LvlSize, dim, sizeV); desc.setSpecifierField(rewriter, loc, StorageSpecifierKind::DimStride, @@ -1428,26 +1427,26 @@ struct SparseNewOpConverter : public OpConversionPattern { fields, nse); MutSparseTensorDescriptor desc(dstTp, fields); - // Construct the `dim2lvl` buffer for handing off to the runtime library. + // Construct the `dimToLvl` buffer for handing off to the runtime library. // FIXME: This code is (mostly) copied from the SparseTensorConversion.cpp // handling of `NewOp`, and only handles permutations. Fixing this // requires waiting for wrengr to finish redoing the CL that handles // all dim<->lvl stuff more robustly. 
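Editor's note: the NewOp lowering continued in the hunk that follows fills a dimension-indexed dimToLvl buffer from the level-to-dimension permutation (getDimPosition(l) yields the dimension stored at level l). A plain C++ sketch of that indexing, under the same still-a-permutation assumption the surrounding assert enforces; buildDimToLvlBuffer is a hypothetical name, not part of the patch.

#include <cassert>
#include <cstdio>
#include <vector>

// lvlToDimPos[l] plays the role of dimToLvl.getDimPosition(l): the dimension
// that appears in the l-th result of the (permutation) dim-to-lvl map.
static std::vector<unsigned>
buildDimToLvlBuffer(const std::vector<unsigned> &lvlToDimPos) {
  const unsigned lvlRank = lvlToDimPos.size();
  std::vector<unsigned> dimToLvlValues(lvlRank); // dimRank == lvlRank here
  for (unsigned l = 0; l < lvlRank; ++l) {
    const unsigned d = lvlToDimPos[l];
    assert(d < lvlRank && "not a permutation");
    dimToLvlValues[d] = l; // dimension d is stored at level l
  }
  return dimToLvlValues;
}

int main() {
  // CSC-style map (i,j) -> (j,i): level 0 holds dim 1, level 1 holds dim 0.
  const std::vector<unsigned> lvlToDimPos = {1, 0};
  const auto dimToLvl = buildDimToLvlBuffer(lvlToDimPos);
  for (unsigned d = 0; d < 2; ++d)
    std::printf("dim %u -> lvl %u\n", d, dimToLvl[d]);
  // Prints: dim 0 -> lvl 1, then dim 1 -> lvl 0.
}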
- SmallVector dim2lvlValues(dimRank); + SmallVector dimToLvlValues(dimRank); if (!dstTp.isIdentity()) { - const auto dimOrder = dstTp.getDimToLvlMap(); - assert(dimOrder.isPermutation() && "Got non-permutation"); + const auto dimToLvl = dstTp.getDimToLvl(); + assert(dimToLvl.isPermutation() && "Got non-permutation"); for (Level l = 0; l < lvlRank; l++) { - const Dimension d = dimOrder.getDimPosition(l); - dim2lvlValues[d] = constantIndex(rewriter, loc, l); + const Dimension d = dimToLvl.getDimPosition(l); + dimToLvlValues[d] = constantIndex(rewriter, loc, l); } } else { // The `SparseTensorType` ctor already ensures `dimRank == lvlRank` // when `isIdentity`; so no need to re-assert it here. for (Dimension d = 0; d < dimRank; d++) - dim2lvlValues[d] = constantIndex(rewriter, loc, d); + dimToLvlValues[d] = constantIndex(rewriter, loc, d); } - Value dim2lvl = allocaBuffer(rewriter, loc, dim2lvlValues); + Value dimToLvl = allocaBuffer(rewriter, loc, dimToLvlValues); // Read the COO tensor data. Value xs = desc.getAOSMemRef(); @@ -1463,7 +1462,7 @@ struct SparseNewOpConverter : public OpConversionPattern { primaryTypeFunctionSuffix(elemTp)}; Value isSorted = createFuncCall(rewriter, loc, readToBuffersFuncName, {boolTp}, - {reader, dim2lvl, xs, ys}, EmitCInterface::On) + {reader, dimToLvl, xs, ys}, EmitCInterface::On) .getResult(0); // If the destination tensor is a sorted COO, we need to sort the COO tensor diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp index 4636615ed24b7..5784506836a2f 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp @@ -96,7 +96,7 @@ static Value createOrFoldLvlCall(OpBuilder &builder, Location loc, // `getDimPosition` checks that the expr isa `AffineDimExpr`, // which is all we care about (for supporting permutations). const Dimension dim = - stt.isIdentity() ? lvl : stt.getDimToLvlMap().getDimPosition(lvl); + stt.isIdentity() ? lvl : stt.getDimToLvl().getDimPosition(lvl); if (const auto sz = stt.getStaticDimSize(dim)) return constantIndex(builder, loc, *sz); // If we cannot statically compute the size from the shape, then we @@ -259,9 +259,9 @@ class NewCallParams final { // TODO: This is only ever used for passing into `genAddEltCall`; // is there a better way to encapsulate that pattern (both to avoid // this one-off getter, and to avoid potential mixups)? 
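Editor's note: for orientation, the accessor renamed just below and the fixed parameter slots it indexes can be pictured with the following toy class. Only the slot order (dimSizes, lvlSizes, lvlTypes, lvlToDim, dimToLvl, posTp, crdTp, valTp) and the initialize-before-use assertion are taken from the patch; strings stand in for mlir::Value handles, and all names here are placeholders.

#include <array>
#include <cassert>
#include <string>
#include <utility>

// Slot order copied from the kParam* constants in the patch.
enum ParamSlot : unsigned {
  kDimSizes = 0,
  kLvlSizes = 1,
  kLvlTypes = 2,
  kLvlToDim = 3,
  kDimToLvl = 4,
  kPosTp = 5,
  kCrdTp = 6,
  kValTp = 7,
  kNumParams = 8
};

// Toy stand-in for NewCallParams: strings instead of SSA values.
class CallParamsSketch {
public:
  void set(ParamSlot slot, std::string v) { params[slot] = std::move(v); }
  bool isInitialized() const {
    for (const auto &p : params)
      if (p.empty())
        return false;
    return true;
  }
  // Mirrors getDimToLvl(): only valid once every slot has been filled.
  const std::string &getDimToLvl() const {
    assert(isInitialized() && "Must initialize before getDimToLvl");
    return params[kDimToLvl];
  }

private:
  std::array<std::string, kNumParams> params;
};

int main() {
  CallParamsSketch p;
  const char *names[kNumParams] = {"dimSizes", "lvlSizes", "lvlTypes",
                                   "lvlToDim", "dimToLvl", "posTp",
                                   "crdTp",    "valTp"};
  for (unsigned i = 0; i < kNumParams; ++i)
    p.set(static_cast<ParamSlot>(i), names[i]);
  return p.getDimToLvl() == "dimToLvl" ? 0 : 1;
}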
- Value getDim2LvlMap() const { - assert(isInitialized() && "Must initialize before getDim2LvlMap"); - return params[kParamDim2Lvl]; + Value getDimToLvl() const { + assert(isInitialized() && "Must initialize before getDimToLvl"); + return params[kParamDimToLvl]; } /// Generates a function call, with the current static parameters @@ -282,8 +282,8 @@ class NewCallParams final { static constexpr unsigned kParamDimSizes = 0; static constexpr unsigned kParamLvlSizes = 1; static constexpr unsigned kParamLvlTypes = 2; - static constexpr unsigned kParamLvl2Dim = 3; - static constexpr unsigned kParamDim2Lvl = 4; + static constexpr unsigned kParamLvlToDim = 3; + static constexpr unsigned kParamDimToLvl = 4; static constexpr unsigned kParamPosTp = 5; static constexpr unsigned kParamCrdTp = 6; static constexpr unsigned kParamValTp = 7; @@ -311,39 +311,39 @@ NewCallParams &NewCallParams::genBuffers(SparseTensorType stt, "Dimension-rank mismatch"); params[kParamDimSizes] = allocaBuffer(builder, loc, dimSizes); // The level-sizes array must be passed as well, since for arbitrary - // dim2lvl mappings it cannot be trivially reconstructed at runtime. + // dimToLvl mappings it cannot be trivially reconstructed at runtime. // For now however, since we're still assuming permutations, we will - // initialize this parameter alongside the `dim2lvl` and `lvl2dim` + // initialize this parameter alongside the `dimToLvl` and `lvlToDim` // parameters below. We preinitialize `lvlSizes` for code symmetry. SmallVector lvlSizes(lvlRank); // The dimension-to-level mapping and its inverse. We must preinitialize - // `dim2lvl` so that the true branch below can perform random-access - // `operator[]` assignment. We preinitialize `lvl2dim` for code symmetry. - SmallVector dim2lvl(dimRank); - SmallVector lvl2dim(lvlRank); + // `dimToLvl` so that the true branch below can perform random-access + // `operator[]` assignment. We preinitialize `lvlToDim` for code symmetry. + SmallVector dimToLvl(dimRank); + SmallVector lvlToDim(lvlRank); if (!stt.isIdentity()) { - const auto dimOrder = stt.getDimToLvlMap(); - assert(dimOrder.isPermutation()); + const auto dimToLvlMap = stt.getDimToLvl(); + assert(dimToLvlMap.isPermutation()); for (Level l = 0; l < lvlRank; l++) { // The `d`th source variable occurs in the `l`th result position. - const Dimension d = dimOrder.getDimPosition(l); - dim2lvl[d] = constantIndex(builder, loc, l); - lvl2dim[l] = constantIndex(builder, loc, d); + const Dimension d = dimToLvlMap.getDimPosition(l); + dimToLvl[d] = constantIndex(builder, loc, l); + lvlToDim[l] = constantIndex(builder, loc, d); lvlSizes[l] = dimSizes[d]; } } else { // The `SparseTensorType` ctor already ensures `dimRank == lvlRank` // when `isIdentity`; so no need to re-assert it here. for (Level l = 0; l < lvlRank; l++) { - dim2lvl[l] = lvl2dim[l] = constantIndex(builder, loc, l); + dimToLvl[l] = lvlToDim[l] = constantIndex(builder, loc, l); lvlSizes[l] = dimSizes[l]; } } params[kParamLvlSizes] = allocaBuffer(builder, loc, lvlSizes); - params[kParamLvl2Dim] = allocaBuffer(builder, loc, lvl2dim); - params[kParamDim2Lvl] = stt.isIdentity() - ? params[kParamLvl2Dim] - : allocaBuffer(builder, loc, dim2lvl); + params[kParamLvlToDim] = allocaBuffer(builder, loc, lvlToDim); + params[kParamDimToLvl] = stt.isIdentity() + ? params[kParamLvlToDim] + : allocaBuffer(builder, loc, dimToLvl); // Secondary and primary types encoding. setTemplateTypes(stt); // Finally, make note that initialization is complete. 
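Editor's note: genBuffers above derives three permutation-dependent buffers at once: the dimension-indexed dimToLvl, its inverse lvlToDim, and the permuted lvlSizes, with the identity case aliasing dimToLvl to lvlToDim. A self-contained sketch of that bookkeeping with plain vectors instead of SSA values; PermutationBuffers and genBuffersSketch are made-up names used only for this illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

struct PermutationBuffers {
  std::vector<unsigned> dimToLvl; // dim d is stored at level dimToLvl[d]
  std::vector<unsigned> lvlToDim; // level l stores dim lvlToDim[l]
  std::vector<int64_t> lvlSizes;  // lvlSizes[l] == dimSizes[lvlToDim[l]]
};

// lvlToDimPos is empty for the identity mapping; otherwise lvlToDimPos[l]
// is the dimension appearing in result l of the permutation.
static PermutationBuffers
genBuffersSketch(const std::vector<int64_t> &dimSizes,
                 const std::vector<unsigned> &lvlToDimPos) {
  const unsigned rank = dimSizes.size();
  PermutationBuffers b{std::vector<unsigned>(rank), std::vector<unsigned>(rank),
                       std::vector<int64_t>(rank)};
  if (!lvlToDimPos.empty()) {
    for (unsigned l = 0; l < rank; ++l) {
      const unsigned d = lvlToDimPos[l];
      b.dimToLvl[d] = l;
      b.lvlToDim[l] = d;
      b.lvlSizes[l] = dimSizes[d];
    }
  } else {
    // Identity fast path: both maps are the iota sequence.
    for (unsigned l = 0; l < rank; ++l) {
      b.dimToLvl[l] = b.lvlToDim[l] = l;
      b.lvlSizes[l] = dimSizes[l];
    }
  }
  return b;
}

int main() {
  // A 32x64 matrix stored column-major (CSC): map (i,j) -> (j,i).
  const auto b = genBuffersSketch({32, 64}, {1, 0});
  std::printf("lvlSizes = [%lld, %lld]\n",
              static_cast<long long>(b.lvlSizes[0]),
              static_cast<long long>(b.lvlSizes[1]));
  // Prints: lvlSizes = [64, 32]
}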
@@ -383,9 +383,9 @@ static void genDelIteratorCall(OpBuilder &builder, Location loc, Type elemTp, /// t->add(&val, [i1,..,ik], [p1,..,pk]); static void genAddEltCall(OpBuilder &builder, Location loc, Type eltType, Value lvlCOO, Value valPtr, Value dimCoords, - Value dim2lvl) { + Value dimToLvl) { SmallString<9> name{"addElt", primaryTypeFunctionSuffix(eltType)}; - SmallVector params{lvlCOO, valPtr, dimCoords, dim2lvl}; + SmallVector params{lvlCOO, valPtr, dimCoords, dimToLvl}; Type pTp = getOpaquePointerType(builder); createFuncCall(builder, loc, name, pTp, params, EmitCInterface::On); } @@ -481,7 +481,7 @@ genSparse2SparseReshape(ReshapeOp op, typename ReshapeOp::Adaptor adaptor, SmallVector srcDimSizes = getDimSizes(rewriter, loc, srcTp, adaptor.getSrc()); NewCallParams params(rewriter, loc); - Value iter = params.genBuffers(srcTp.withoutOrdering(), srcDimSizes) + Value iter = params.genBuffers(srcTp.withoutDimToLvl(), srcDimSizes) .genNewCall(Action::kToIterator, adaptor.getSrc()); // Start a new COO for the destination tensor. SmallVector dstDimSizes; @@ -493,7 +493,7 @@ genSparse2SparseReshape(ReshapeOp op, typename ReshapeOp::Adaptor adaptor, dstTp.getDimShape(), op.getReassociationIndices()); const Value coo = params.genBuffers(dstTp, dstDimSizes).genNewCall(Action::kEmptyCOO); - const Value dstPerm = params.getDim2LvlMap(); + const Value dstDimToLvl = params.getDimToLvl(); // Construct a while loop over the iterator. const Type iTp = rewriter.getIndexType(); const Value srcDimCoords = genAlloca(rewriter, loc, srcTp.getDimRank(), iTp); @@ -515,7 +515,7 @@ genSparse2SparseReshape(ReshapeOp op, typename ReshapeOp::Adaptor adaptor, assert(dstTp.getDimRank() == dstDimSizes.size()); reshapeCoords(loc, rewriter, op.getReassociationIndices(), srcDimSizes, srcDimCoords, dstDimSizes, dstDimCoords); - genAddEltCall(rewriter, loc, elemTp, coo, elemPtr, dstDimCoords, dstPerm); + genAddEltCall(rewriter, loc, elemTp, coo, elemPtr, dstDimCoords, dstDimToLvl); rewriter.create(loc); // Final call to construct sparse tensor storage and free temporary resources. rewriter.setInsertionPointAfter(whileOp); @@ -544,7 +544,7 @@ static void genSparseCOOIterationLoop( const Type elemTp = stt.getElementType(); // Start an iterator over the tensor (in coordinate order). - const auto noPerm = stt.withoutOrdering(); + const auto noPerm = stt.withoutDimToLvl(); SmallVector dimSizes = getDimSizes(rewriter, loc, noPerm, t); Value iter = NewCallParams(rewriter, loc) .genBuffers(noPerm, dimSizes) @@ -714,7 +714,7 @@ class SparseTensorNewConverter : public OpConversionPattern { SmallVector dimShapeValues = getDimShape(rewriter, loc, stt); Value dimShapeBuffer = allocaBuffer(rewriter, loc, dimShapeValues); // Allocate `SparseTensorReader` and perform all initial setup that - // does not depend on lvlSizes (nor dim2lvl, lvl2dim, etc). + // does not depend on lvlSizes (nor dimToLvl, lvlToDim, etc). Type opaqueTp = getOpaquePointerType(rewriter); Value valTp = constantPrimaryTypeEncoding(rewriter, loc, stt.getElementType()); @@ -729,7 +729,7 @@ class SparseTensorNewConverter : public OpConversionPattern { // compile-time. If dimShape is dynamic, then we'll need to generate // code for computing lvlSizes from the `reader`'s actual dimSizes. // - // TODO: For now we're still assuming `dim2lvl` is a permutation. + // TODO: For now we're still assuming `dimToLvl` is a permutation. // But since we're computing lvlSizes here (rather than in the runtime), // we can easily generalize that simply by adjusting this code. 
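Editor's note: the new-op converter continuing below picks each level size either from the static dimension shape or, for dynamic dimensions, from the reader's runtime dimSizes buffer. A small sketch of that selection under the same permutation assumption; ShapedDim and pickLvlSizes are invented for the illustration and do not exist in the codebase.

#include <cassert>
#include <cstdint>
#include <vector>

// A dimension is either statically sized or dynamic (size known only at
// runtime, here supplied through runtimeDimSizes).
struct ShapedDim {
  bool isDynamic;
  int64_t staticSize; // ignored when isDynamic
};

static std::vector<int64_t>
pickLvlSizes(const std::vector<ShapedDim> &dimShape,
             const std::vector<int64_t> &runtimeDimSizes,
             const std::vector<unsigned> &lvlToDimPos) {
  const unsigned lvlRank = lvlToDimPos.size();
  std::vector<int64_t> lvlSizes(lvlRank);
  for (unsigned l = 0; l < lvlRank; ++l) {
    const unsigned d = lvlToDimPos[l];
    assert(d < dimShape.size());
    // Mirrors: isDynamicDim(d) ? load(dimSizesBuffer, d) : dimShapeValues[d]
    lvlSizes[l] = dimShape[d].isDynamic ? runtimeDimSizes[d]
                                        : dimShape[d].staticSize;
  }
  return lvlSizes;
}

int main() {
  // A ?x64 tensor whose file header reports the first dim as 100, stored
  // with the CSC-style map (i,j) -> (j,i).
  const auto sizes = pickLvlSizes({{true, -1}, {false, 64}}, {100, 64}, {1, 0});
  return (sizes[0] == 64 && sizes[1] == 100) ? 0 : 1;
}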
// @@ -744,31 +744,31 @@ class SparseTensorNewConverter : public OpConversionPattern { .getResult(0); } Value lvlSizesBuffer; - Value lvl2dimBuffer; - Value dim2lvlBuffer; + Value lvlToDimBuffer; + Value dimToLvlBuffer; if (!stt.isIdentity()) { - const auto dimOrder = stt.getDimToLvlMap(); - assert(dimOrder.isPermutation() && "Got non-permutation"); - // We preinitialize `dim2lvlValues` since we need random-access writing. + const auto dimToLvl = stt.getDimToLvl(); + assert(dimToLvl.isPermutation() && "Got non-permutation"); + // We preinitialize `dimToLvlValues` since we need random-access writing. // And we preinitialize the others for stylistic consistency. SmallVector lvlSizeValues(lvlRank); - SmallVector lvl2dimValues(lvlRank); - SmallVector dim2lvlValues(dimRank); + SmallVector lvlToDimValues(lvlRank); + SmallVector dimToLvlValues(dimRank); for (Level l = 0; l < lvlRank; l++) { // The `d`th source variable occurs in the `l`th result position. - Dimension d = dimOrder.getDimPosition(l); + Dimension d = dimToLvl.getDimPosition(l); Value lvl = constantIndex(rewriter, loc, l); Value dim = constantIndex(rewriter, loc, d); - dim2lvlValues[d] = lvl; - lvl2dimValues[l] = dim; + dimToLvlValues[d] = lvl; + lvlToDimValues[l] = dim; lvlSizeValues[l] = stt.isDynamicDim(d) ? rewriter.create(loc, dimSizesBuffer, dim) : dimShapeValues[d]; } lvlSizesBuffer = allocaBuffer(rewriter, loc, lvlSizeValues); - lvl2dimBuffer = allocaBuffer(rewriter, loc, lvl2dimValues); - dim2lvlBuffer = allocaBuffer(rewriter, loc, dim2lvlValues); + lvlToDimBuffer = allocaBuffer(rewriter, loc, lvlToDimValues); + dimToLvlBuffer = allocaBuffer(rewriter, loc, dimToLvlValues); } else { // The `SparseTensorType` ctor already ensures `dimRank == lvlRank` // when `isIdentity`; so no need to re-assert it here. @@ -777,15 +777,15 @@ class SparseTensorNewConverter : public OpConversionPattern { for (Level l = 0; l < lvlRank; l++) iotaValues.push_back(constantIndex(rewriter, loc, l)); lvlSizesBuffer = dimSizesBuffer ? dimSizesBuffer : dimShapeBuffer; - dim2lvlBuffer = lvl2dimBuffer = allocaBuffer(rewriter, loc, iotaValues); + dimToLvlBuffer = lvlToDimBuffer = allocaBuffer(rewriter, loc, iotaValues); } // Use the `reader` to parse the file. SmallVector params{ reader, lvlSizesBuffer, genLvlTypesBuffer(rewriter, loc, stt), - lvl2dimBuffer, - dim2lvlBuffer, + lvlToDimBuffer, + dimToLvlBuffer, constantPosTypeEncoding(rewriter, loc, stt.getEncoding()), constantCrdTypeEncoding(rewriter, loc, stt.getEncoding()), valTp}; @@ -895,10 +895,8 @@ class SparseTensorConvertConverter : public OpConversionPattern { // Set up encoding with right mix of src and dst so that the two // method calls can share most parameters, while still providing // the correct sparsity information to either of them. - const auto mixedEnc = SparseTensorEncodingAttr::get( - op->getContext(), dstEnc.getLvlTypes(), dstEnc.getDimOrdering(), - dstEnc.getHigherOrdering(), srcEnc.getPosWidth(), - srcEnc.getCrdWidth()); + const auto mixedEnc = + dstEnc.withBitWidths(srcEnc.getPosWidth(), srcEnc.getCrdWidth()); // TODO: This is the only place where `kToCOO` (or `kToIterator`) // is called with a non-identity permutation. Is there any clean // way to push the permutation over to the `kFromCOO` side instead? 
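Editor's note: the convert lowering above now builds the "mixed" encoding with a single withBitWidths call: level types and dimToLvl come from the destination, position/coordinate widths from the source. The struct below is a plain-value sketch of that behaviour; Encoding and its withBitWidths are stand-ins, not the TableGen-generated attribute API.

#include <cstdio>
#include <string>
#include <vector>

// Toy value type standing in for SparseTensorEncodingAttr.
struct Encoding {
  std::vector<std::string> lvlTypes;
  std::string dimToLvl; // affine map rendered as text, "" means identity
  unsigned posWidth = 0;
  unsigned crdWidth = 0;

  // Same fields, but with the overhead bit widths replaced.
  Encoding withBitWidths(unsigned newPosWidth, unsigned newCrdWidth) const {
    Encoding e = *this;
    e.posWidth = newPosWidth;
    e.crdWidth = newCrdWidth;
    return e;
  }
};

int main() {
  const Encoding dst{{"dense", "compressed"}, "(d0, d1) -> (d1, d0)", 0, 0};
  const Encoding src{{"compressed", "compressed"}, "", 32, 64};
  // Sparsity and ordering of dst, overhead widths of src.
  const Encoding mixed = dst.withBitWidths(src.posWidth, src.crdWidth);
  std::printf("dimToLvl=%s posWidth=%u crdWidth=%u\n", mixed.dimToLvl.c_str(),
              mixed.posWidth, mixed.crdWidth);
  // Prints: dimToLvl=(d0, d1) -> (d1, d0) posWidth=32 crdWidth=64
}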
@@ -927,7 +925,7 @@ class SparseTensorConvertConverter : public OpConversionPattern { const auto dstEnc = SparseTensorEncodingAttr::get( op->getContext(), SmallVector(dimRank, DimLevelType::Dense), AffineMap(), - AffineMap(), srcEnc.getPosWidth(), srcEnc.getCrdWidth()); + srcEnc.getPosWidth(), srcEnc.getCrdWidth()); SmallVector dimSizes = getDimSizes(rewriter, loc, srcTp, src); Value iter = NewCallParams(rewriter, loc) .genBuffers(dstTp.withEncoding(dstEnc), dimSizes) @@ -996,7 +994,7 @@ class SparseTensorConvertConverter : public OpConversionPattern { params.genBuffers(dstTp, dimSizes).genNewCall(Action::kEmptyCOO); const Type iTp = rewriter.getIndexType(); Value dimCoords = genAlloca(rewriter, loc, dimRank, iTp); - Value perm = params.getDim2LvlMap(); + Value dimToLvl = params.getDimToLvl(); Value elemPtr = genAllocaScalar(rewriter, loc, elemTp); genDenseTensorOrSparseConstantIterLoop( rewriter, loc, src, dimRank, @@ -1004,7 +1002,8 @@ class SparseTensorConvertConverter : public OpConversionPattern { assert(dcvs.size() == static_cast(dimRank)); storeAll(builder, loc, dimCoords, dcvs); builder.create(loc, val, elemPtr); - genAddEltCall(builder, loc, elemTp, coo, elemPtr, dimCoords, perm); + genAddEltCall(builder, loc, elemTp, coo, elemPtr, dimCoords, + dimToLvl); }); // Final call to construct sparse tensor storage. Value dst = params.genNewCall(Action::kFromCOO, coo); @@ -1284,7 +1283,7 @@ class SparseTensorConcatConverter : public OpConversionPattern { const Dimension dimRank = dstTp.getDimRank(); Value dst; // destination tensor - Value dstPerm; // destination tensor permutation (if sparse out) + Value dstDimToLvl; // destination tensor permutation (if sparse out) // A pointer to the value being inserted (if dense => sparse) Value elemPtr; // Memory that holds the dim-coords for destination tensor (if sparse out) @@ -1318,7 +1317,7 @@ class SparseTensorConcatConverter : public OpConversionPattern { dst = reshapeValuesToLevels(rewriter, loc, dstEnc, dimSizes, dst, dstDimCoords); } else { - dstPerm = params.getDim2LvlMap(); + dstDimToLvl = params.getDimToLvl(); elemPtr = genAllocaScalar(rewriter, loc, elemTp); } } else { @@ -1350,7 +1349,7 @@ class SparseTensorConcatConverter : public OpConversionPattern { // Case: sparse => sparse, except for annotated all dense. storeAll(builder, loc, dstDimCoords, dcvs); genAddEltCall(builder, loc, elemTp, dst, elemPtr, dstDimCoords, - dstPerm); + dstDimToLvl); } else { // Case: sparse => dense, or annotated all dense. const auto lcvs = allDense ? dcvs2lcvs(dcvs) : dcvs; @@ -1368,7 +1367,7 @@ class SparseTensorConcatConverter : public OpConversionPattern { Value val = genValueForDense(builder, loc, adaptedOp, dcvs); builder.create(loc, val, elemPtr); genAddEltCall(builder, loc, elemTp, dst, elemPtr, dstDimCoords, - dstPerm); + dstDimToLvl); } else { // Case: dense => dense, or annotated all dense. Value val = genValueForDense(builder, loc, adaptedOp, dcvs); @@ -1420,7 +1419,7 @@ class SparseTensorOutConverter : public OpConversionPattern { Value src = adaptor.getOperands()[0]; SmallVector dimSizes = getDimSizes(rewriter, loc, srcTp, src); Value coo = NewCallParams(rewriter, loc) - .genBuffers(srcTp.withoutOrdering(), dimSizes) + .genBuffers(srcTp.withoutDimToLvl(), dimSizes) .genNewCall(Action::kToCOO, src); // Then output the tensor to external file with coordinates in the // externally visible lexicographic coordinate order. 
A sort is diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp index ce9490cfddef1..de0f2f7346485 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -128,13 +128,14 @@ static void sizesForTensor(OpBuilder &builder, SmallVectorImpl &sizes, // TODO: The dim level property of the COO type relies on input tensors, the // shape relies on the output tensor -static RankedTensorType -getUnorderedCOOFromTypeWithOrdering(RankedTensorType src, AffineMap ordering) { - return getCOOFromTypeWithOrdering(src, ordering, false); +static RankedTensorType getCOOType(const SparseTensorType &stt, bool ordered) { + return getCOOFromTypeWithOrdering(stt, stt.getDimToLvl(), ordered); } -static RankedTensorType getUnorderedCOOFromType(RankedTensorType src) { - return getCOOFromType(src, false); +static RankedTensorType getBufferType(const SparseTensorType &stt, + bool needTmpCOO) { + return needTmpCOO ? getCOOType(stt, /*ordered=*/false) + : stt.getRankedTensorType(); } /// Collects the dynamic dimension sizes for `tp` with the assumption that @@ -411,10 +412,9 @@ struct TensorReshapeRewriter : public OpRewritePattern { Value nnz = rewriter.create(loc, srcTensor); // Only need an unordered COO buffer if input and output are not sorted // in the same way. - Type bufferTp = - srcTp.isAllOrdered() && srcTp.isIdentity() && dstTp.isIdentity() - ? dstTp.getRankedTensorType() - : getUnorderedCOOFromType(dstTp); + Type bufferTp = getBufferType( + dstTp.withoutDimToLvl(), + !srcTp.isAllOrdered() || !srcTp.isIdentity() || !dstTp.isIdentity()); SmallVector dynSizes; Value buffer = rewriter .create(loc, bufferTp, dynSizes, Value(), @@ -522,10 +522,9 @@ struct Sparse2SparseReshapeRewriter : public OpRewritePattern { Value nnz = rewriter.create(loc, srcTensor); // Only need a unordered COO buffer if input and output are not sorted // in the same way. - Type bufferTp = - srcTp.isAllOrdered() && srcTp.isIdentity() && dstTp.isIdentity() - ? dstTp.getRankedTensorType() - : getUnorderedCOOFromType(dstTp); + Type bufferTp = getBufferType( + dstTp.withoutDimToLvl(), + !srcTp.isAllOrdered() || !srcTp.isIdentity() || !dstTp.isIdentity()); Value buffer = rewriter @@ -648,12 +647,12 @@ struct ConcatenateRewriter : public OpRewritePattern { Value annotatedDenseDst; if (dstTp.hasEncoding()) { bool allOrdered = false; - // When concatenating on dimension 0, and all inputs are sorted and have - // an identity dimOrdering, the concatenate will generate coords in - // lexOrder thus no need for the tmp COO buffer. + // When concatenating on dimension 0, and all inputs are sorted + // and have an identity dimToLvl, the concatenate will generate + // coords in lexOrder thus no need for the tmp COO buffer. // TODO: When conDim != 0, as long as conDim is the first dimension // in all input/output buffers, and all input/output buffers have the same - // dimOrdering, the tmp COO buffer is still unnecessary (e.g, concatenate + // dimToLvl, the tmp COO buffer is still unnecessary (e.g, concatenate // CSC matrices along column). if (!allDense && conDim == 0 && dstTp.isIdentity()) { for (auto i : op.getInputs()) { @@ -665,8 +664,8 @@ struct ConcatenateRewriter : public OpRewritePattern { } needTmpCOO = !allDense && !allOrdered; - const RankedTensorType tp = needTmpCOO ? 
getUnorderedCOOFromType(dstTp) - : dstTp.getRankedTensorType(); + const RankedTensorType tp = + getBufferType(dstTp.withoutDimToLvl(), needTmpCOO); encDst = needTmpCOO ? getSparseTensorEncoding(tp) : encDst; SmallVector dynSizes; getDynamicSizes(dstTp, sizes, dynSizes); @@ -831,16 +830,20 @@ struct ConvertRewriter : public OpRewritePattern { // COO tensor. // TODO: enhance foreachOp to take ordering to remove the need of a // temporary COO tensor here. - const RankedTensorType bufferTp = dstTp.isIdentity() || fromSparseConst - ? dstTp.getRankedTensorType() - : getUnorderedCOOFromTypeWithOrdering( - dstTp, dstTp.getDimToLvlMap()); + const RankedTensorType bufferTp = + getBufferType(dstTp, !dstTp.isIdentity() && !fromSparseConst); // Only imposes foreach order on dense constant (which will be statically // sorted by the sparse compiler), otherwise the rotated loop sequence // results to bad cache locality. - AffineMapAttr foreachOrder = nullptr; - if (encDst.getDimOrdering() && fromSparseConst) - foreachOrder = AffineMapAttr::get(encDst.getDimOrdering()); + const AffineMapAttr foreachOrder = + (!dstTp.isIdentity() && fromSparseConst) + ? AffineMapAttr::get(dstTp.getExpandedDimToLvl()) + : nullptr; + // TODO: This assertion is to match the behavior from before we merged + // dimOrdering and higherOrdering into dimToLvl. Although the above + // can construct `foreachOrder` for non-permutations, it's not clear + // that the `foreachOp` below actually supports non-permutations. + assert(!foreachOrder || dstTp.isPermutation()); auto buffer = rewriter.create(loc, bufferTp, dynSizes).getResult(); @@ -950,17 +953,16 @@ struct ConvertRewriter : public OpRewritePattern { // 1. the src tensor is not a COO and // 2. the src tensor is not ordered in the same way as the target // tensor (e.g., src tensor is not ordered or src tensor haves a different - // dimOrdering). + // dimToLvl). if (const SparseTensorType srcTp(srcRTT); - !(srcTp.isAllOrdered() && srcTp.hasSameDimToLvlMap(dstTp))) { + !(srcTp.isAllOrdered() && srcTp.hasSameDimToLvl(dstTp))) { // Construct a COO tensor from the src tensor. // TODO: there may be cases for which more efficiently without // going through an intermediate COO, such as cases that only change // the overhead types. SmallVector dynSrcSizes; getDynamicSizes(srcRTT, srcSizes, dynSrcSizes); - srcRTT = - getUnorderedCOOFromTypeWithOrdering(srcRTT, dstTp.getDimToLvlMap()); + srcRTT = getCOOType(srcTp.withDimToLvl(dstTp), /*ordered=*/false); // Ensure that mutating `srcRTT` didn't invalidate `dimRank`. assert(static_cast(srcRTT.getRank()) == dimRank); tmpCoo = rewriter @@ -995,7 +997,7 @@ struct ConvertRewriter : public OpRewritePattern { // Sort the COO tensor so that its elements are ordered via increasing // coordinates for the storage ordering of the dst tensor. Use SortCoo // if the COO tensor has the same ordering as the dst tensor. 
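Editor's note: the reshape and concatenate rewriters above now funnel the choice of intermediate buffer through getBufferType: keep the destination's own type when the source is already fully ordered and both sides use the identity dimToLvl, otherwise route through an unordered COO. A compact restatement of that predicate; the enum and helper names are invented for the sketch.

#include <cstdio>

enum class BufferKind { DestinationType, UnorderedCOO };

// Mirrors: needTmpCOO = !srcAllOrdered || !srcIdentity || !dstIdentity,
// with getBufferType(..., needTmpCOO) selecting between the two forms.
static BufferKind chooseBuffer(bool srcAllOrdered, bool srcIdentity,
                               bool dstIdentity) {
  const bool needTmpCOO = !srcAllOrdered || !srcIdentity || !dstIdentity;
  return needTmpCOO ? BufferKind::UnorderedCOO : BufferKind::DestinationType;
}

int main() {
  // CSR -> CSR reshape: both identities, source ordered -> no temporary COO.
  std::printf("%d\n", chooseBuffer(true, true, true) ==
                          BufferKind::DestinationType);
  // CSR -> CSC (destination uses a column-major permutation) -> temporary COO.
  std::printf("%d\n",
              chooseBuffer(true, true, false) == BufferKind::UnorderedCOO);
}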
- if (dimRank > 1 && srcTp.hasSameDimToLvlMap(dstTp)) { + if (dimRank > 1 && srcTp.hasSameDimToLvl(dstTp)) { Value xs = genToCoordinatesBuffer(rewriter, loc, src); rewriter.create( loc, nnz, xs, ValueRange{y}, rewriter.getIndexAttr(dimRank), @@ -1174,8 +1176,7 @@ struct NewRewriter : public OpRewritePattern { // Implement the NewOp as follows: // %orderedCoo = sparse_tensor.new %filename // %t = sparse_tensor.convert %orderedCoo - RankedTensorType cooTp = - getCOOFromTypeWithOrdering(dstTp, encDst.getDimOrdering(), true); + RankedTensorType cooTp = getCOOType(dstTp, /*ordered=*/true); Value cooTensor = rewriter.create(loc, cooTp, op.getSource()); Value convert = rewriter.replaceOpWithNewOp( op, dstTp.getRankedTensorType(), cooTensor); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp index 1b711992a30d5..4334290de6498 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp @@ -1920,11 +1920,14 @@ struct GenericOpSparsifier : public OpRewritePattern { // especially if it is a direct yield! // auto srcTp = getRankedTensorType(tval); - auto dstEnc = SparseTensorEncodingAttr::get( - getContext(), srcEnc.getLvlTypes(), - permute(env, env.op().getMatchingIndexingMap(t)), // new order - srcEnc.getHigherOrdering(), srcEnc.getPosWidth(), - srcEnc.getCrdWidth()); + // TODO: This assertion is to match the behavior from prior to + // merging dimOrdering and higherOrdering into dimToLvl. However, + // since `permute` returns a permutation, we can remove this + // restriction by instead composing the result of `permute` + // with `srcEnc.getDimToLvl`. + assert(srcEnc.isPermutation()); + auto dstEnc = + srcEnc.withDimToLvl(permute(env, env.op().getMatchingIndexingMap(t))); auto dstTp = RankedTensorType::get(srcTp.getShape(), srcTp.getElementType(), dstEnc); auto convert = rewriter.create(tval.getLoc(), dstTp, tval); diff --git a/mlir/test/CAPI/sparse_tensor.c b/mlir/test/CAPI/sparse_tensor.c index 306fa8e05ab29..6449a8f0c7940 100644 --- a/mlir/test/CAPI/sparse_tensor.c +++ b/mlir/test/CAPI/sparse_tensor.c @@ -26,8 +26,7 @@ static int testRoundtripEncoding(MlirContext ctx) { const char *originalAsm = "#sparse_tensor.encoding<{ " "lvlTypes = [ \"dense\", \"compressed\", \"compressed\"], " - "dimOrdering = affine_map<(d0, d1, d2) -> (d0, d1, d2)>, " - "higherOrdering = affine_map<(d0, d1)[s0] -> (s0, d0, d1)>, " + "dimToLvl = affine_map<(d0, d1)[s0] -> (s0, d0, d1)>, " "posWidth = 32, crdWidth = 64 }>"; // clang-format on MlirAttribute originalAttr = @@ -35,14 +34,10 @@ static int testRoundtripEncoding(MlirContext ctx) { // CHECK: isa: 1 fprintf(stderr, "isa: %d\n", mlirAttributeIsASparseTensorEncodingAttr(originalAttr)); - MlirAffineMap dimOrdering = - mlirSparseTensorEncodingAttrGetDimOrdering(originalAttr); - // CHECK: (d0, d1, d2) -> (d0, d1, d2) - mlirAffineMapDump(dimOrdering); - MlirAffineMap higherOrdering = - mlirSparseTensorEncodingAttrGetHigherOrdering(originalAttr); + MlirAffineMap dimToLvl = + mlirSparseTensorEncodingAttrGetDimToLvl(originalAttr); // CHECK: (d0, d1)[s0] -> (s0, d0, d1) - mlirAffineMapDump(higherOrdering); + mlirAffineMapDump(dimToLvl); // CHECK: level_type: 4 // CHECK: level_type: 8 // CHECK: level_type: 8 @@ -61,7 +56,7 @@ static int testRoundtripEncoding(MlirContext ctx) { fprintf(stderr, "crdWidth: %d\n", crdWidth); MlirAttribute newAttr = mlirSparseTensorEncodingAttrGet( - ctx, lvlRank, lvlTypes, dimOrdering, 
higherOrdering, posWidth, crdWidth); + ctx, lvlRank, lvlTypes, dimToLvl, posWidth, crdWidth); mlirAttributeDump(newAttr); // For debugging filecheck output. // CHECK: equal: 1 fprintf(stderr, "equal: %d\n", mlirAttributeEqual(originalAttr, newAttr)); diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir index 243f3ae4513ee..f83d89961e828 100644 --- a/mlir/test/Dialect/SparseTensor/codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/codegen.mlir @@ -32,7 +32,7 @@ #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i, j) -> (j, i)> + dimToLvl = affine_map<(i, j) -> (j, i)> }> #DCSR = #sparse_tensor.encoding<{ @@ -43,7 +43,7 @@ #Dense3D = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense", "dense" ], - dimOrdering = affine_map<(i, j, k) -> (k, i, j)> + dimToLvl = affine_map<(i, j, k) -> (k, i, j)> }> #Coo = #sparse_tensor.encoding<{ @@ -52,7 +52,7 @@ #CooPNo = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton-no" ], - dimOrdering = affine_map<(i, j) -> (j, i)> + dimToLvl = affine_map<(i, j) -> (j, i)> }> #ccoo = #sparse_tensor.encoding<{ @@ -189,7 +189,7 @@ func.func @sparse_dense_3d(%arg0: tensor<10x20x30xf64, #Dense3D>) -> index { // // Querying for dimension 1 in the tensor type needs to be permuted // into querying for dimension 2 in the stored sparse tensor scheme, -// since the latter honors the dimOrdering. +// since the latter honors the dimToLvl mapping. // // CHECK-LABEL: func @sparse_dense_3d_dyn( // CHECK-SAME: %[[A0:.*]]: memref, diff --git a/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir b/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir index 0b8e1e86dba30..1aff486e49fb2 100644 --- a/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir +++ b/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir @@ -9,7 +9,7 @@ #CSR = #sparse_tensor.encoding<{ lvlTypes = ["dense", "compressed"]}> #CSC = #sparse_tensor.encoding<{ lvlTypes = ["dense", "compressed"], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // diff --git a/mlir/test/Dialect/SparseTensor/conversion.mlir b/mlir/test/Dialect/SparseTensor/conversion.mlir index 3fcbd829765a8..aa432460173cf 100644 --- a/mlir/test/Dialect/SparseTensor/conversion.mlir +++ b/mlir/test/Dialect/SparseTensor/conversion.mlir @@ -22,12 +22,12 @@ #CSC = #sparse_tensor.encoding<{ lvlTypes = ["dense", "compressed"], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #SparseTensor = #sparse_tensor.encoding<{ lvlTypes = ["dense", "compressed", "compressed"], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> // CHECK-LABEL: func @sparse_nop( diff --git a/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir b/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir index 33dee7c60653d..ac9a613134ed5 100644 --- a/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir +++ b/mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir @@ -12,12 +12,12 @@ #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i, j) -> (j, i)> + dimToLvl = affine_map<(i, j) -> (j, i)> }> #SparseTensor = #sparse_tensor.encoding<{ lvlTypes = ["dense", "compressed", "compressed"], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> // CHECK-LABEL: func @sparse_convert_1d( diff --git 
a/mlir/test/Dialect/SparseTensor/convert_sparse2dense.mlir b/mlir/test/Dialect/SparseTensor/convert_sparse2dense.mlir index 3045aea07f227..1adc9f9566da3 100644 --- a/mlir/test/Dialect/SparseTensor/convert_sparse2dense.mlir +++ b/mlir/test/Dialect/SparseTensor/convert_sparse2dense.mlir @@ -13,7 +13,7 @@ #SparseTensor = #sparse_tensor.encoding<{ lvlTypes = ["dense", "compressed", "compressed"], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> // CHECK-LABEL: func @sparse_convert_1d( diff --git a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir index 3ecf7698945ad..26f41e142b8b7 100644 --- a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir +++ b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir @@ -36,7 +36,7 @@ #TsssPermuted = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> #COOSlice = #sparse_tensor.encoding<{ diff --git a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir index 5d6f34f4e0697..91c3ef7b6d62d 100644 --- a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir @@ -6,12 +6,14 @@ func.func private @scalar(%arg0: tensor) -> () // ----- +// expected-error@+2 {{dimension-rank mismatch between encoding and tensor shape: 2 != 1}} #a = #sparse_tensor.encoding<{lvlTypes = ["dense", "compressed"]}> -func.func private @tensor_dimlevel_size_mismatch(%arg0: tensor<8xi32, #a>) -> () // expected-error {{expected an array of size 1 for lvlTypes}} +func.func private @tensor_dimlevel_size_mismatch(%arg0: tensor<8xi32, #a>) -> () // ----- -#a = #sparse_tensor.encoding<{lvlTypes = ["dense", "compressed"], dimOrdering = affine_map<(i) -> (i)>}> // expected-error {{level-rank mismatch between dimOrdering and lvlTypes}} +// expected-error@+1 {{level-rank mismatch between dimToLvl and lvlTypes: 1 != 2}} +#a = #sparse_tensor.encoding<{lvlTypes = ["dense", "compressed"], dimToLvl = affine_map<(i) -> (i)>}> func.func private @tensor_sizes_mismatch(%arg0: tensor<8xi32, #a>) -> () // ----- @@ -26,18 +28,13 @@ func.func private @tensor_value_mismatch(%arg0: tensor<8xi32, #a>) -> () // ----- -#a = #sparse_tensor.encoding<{dimOrdering = "wrong"}> // expected-error {{expected an affine map for dimension ordering}} -func.func private @tensor_dimorder_mismatch(%arg0: tensor<8xi32, #a>) -> () +#a = #sparse_tensor.encoding<{dimToLvl = "wrong"}> // expected-error {{expected an affine map for dimToLvl}} +func.func private @tensor_dimtolvl_mismatch(%arg0: tensor<8xi32, #a>) -> () // ----- -#a = #sparse_tensor.encoding<{higherOrdering = "wrong"}> // expected-error {{expected an affine map for higher ordering}} -func.func private @tensor_highorder_mismatch(%arg0: tensor<8xi32, #a>) -> () - -// ----- - -// expected-error@+1 {{expected a permutation affine map for dimension ordering}} -#a = #sparse_tensor.encoding<{lvlTypes = ["dense", "compressed"], dimOrdering = affine_map<(i,j) -> (i,i)>}> +// expected-error@+1 {{expected a permutation affine map for dimToLvl}} +#a = #sparse_tensor.encoding<{lvlTypes = ["dense", "compressed"], dimToLvl = affine_map<(i,j) -> (i,i)>}> func.func private @tensor_no_permutation(%arg0: tensor<16x32xf32, #a>) -> () // ----- @@ -67,11 +64,6 @@ func.func private @tensor_invalid_key(%arg0: tensor<16x32xf32, #a>) -> () // ----- 
-#a = #sparse_tensor.encoding<{lvlTypes = [ "compressed", "compressed", "dense", "dense" ], dimOrdering = affine_map<(ii, jj, i, j) -> (ii, jj, i, j)>, higherOrdering = affine_map<(i, j) -> (j, i)>}> // expected-error {{unexpected higher ordering mapping from 2 to 2}} -func.func private @tensor_invalid_key(%arg0: tensor<10x60xf32, #a>) -> () - -// ----- - #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], slice = [ (-1, ?, 1), (?, 4, 2) ] // expected-error{{expect positive value or ? for slice offset/size/stride}} diff --git a/mlir/test/Dialect/SparseTensor/one_shot_bufferize_tensor_copy_insertion.mlir b/mlir/test/Dialect/SparseTensor/one_shot_bufferize_tensor_copy_insertion.mlir index 42ca38ab09985..93bcfe0d0cefb 100644 --- a/mlir/test/Dialect/SparseTensor/one_shot_bufferize_tensor_copy_insertion.mlir +++ b/mlir/test/Dialect/SparseTensor/one_shot_bufferize_tensor_copy_insertion.mlir @@ -3,7 +3,7 @@ #DCSR = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (i,j)> + dimToLvl = affine_map<(i,j) -> (i,j)> }> // CHECK-LABEL: func @bufferization_alloc_tensor diff --git a/mlir/test/Dialect/SparseTensor/rewriting_for_codegen.mlir b/mlir/test/Dialect/SparseTensor/rewriting_for_codegen.mlir index e0043f12b6cb3..0bdeeeeece870 100644 --- a/mlir/test/Dialect/SparseTensor/rewriting_for_codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/rewriting_for_codegen.mlir @@ -7,7 +7,7 @@ #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i, j) -> (j, i)> + dimToLvl = affine_map<(i, j) -> (j, i)> }> #COO = #sparse_tensor.encoding<{ @@ -26,8 +26,8 @@ func.func @sparse_new(%arg0: !llvm.ptr) -> tensor { } // CHECK-LABEL: func.func @sparse_new_csc( -// CHECK-SAME: %[[A:.*]]: !llvm.ptr) -> tensor (d1, d0)> }>> { -// CHECK: %[[COO:.*]] = sparse_tensor.new %[[A]] : !llvm.ptr to tensor (d1, d0)> }>> +// CHECK-SAME: %[[A:.*]]: !llvm.ptr) -> tensor (d1, d0)> }>> { +// CHECK: %[[COO:.*]] = sparse_tensor.new %[[A]] : !llvm.ptr to tensor (d1, d0)> }>> // CHECK: %[[R:.*]] = sparse_tensor.convert %[[COO]] // CHECK: bufferization.dealloc_tensor %[[COO]] // CHECK: return %[[R]] diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir index 891d94659587b..4a7cd76ac489f 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir @@ -8,7 +8,7 @@ func.func private @sparse_1d_tensor(tensor<32xf64, #sparse_tensor.encoding<{ lvl #CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (i,j)>, + dimToLvl = affine_map<(i,j) -> (i,j)>, posWidth = 64, crdWidth = 64 }> @@ -21,26 +21,26 @@ func.func private @sparse_csr(tensor) #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)>, + dimToLvl = affine_map<(i,j) -> (j,i)>, posWidth = 0, crdWidth = 0 }> // CHECK-LABEL: func private @sparse_csc( -// CHECK-SAME: tensor (d1, d0)> }>>) +// CHECK-SAME: tensor (d1, d0)> }>>) func.func private @sparse_csc(tensor) // ----- #DCSC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)>, + dimToLvl = affine_map<(i,j) -> (j,i)>, posWidth = 0, crdWidth = 64 }> // CHECK-LABEL: func private @sparse_dcsc( -// CHECK-SAME: tensor (d1, d0)>, crdWidth = 64 }>>) +// CHECK-SAME: tensor (d1, d0)>, crdWidth = 64 }>>) func.func private 
@sparse_dcsc(tensor) // ----- @@ -77,12 +77,11 @@ func.func private @sparse_sorted_coo(tensor<10x10xf64, #SortedCOO>) #BCSR = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "dense", "dense" ], - dimOrdering = affine_map<(ii, jj, i, j) -> (ii, jj, i, j)>, - higherOrdering = affine_map<(i, j) -> (i floordiv 2, j floordiv 3, i mod 2, j mod 3)> + dimToLvl = affine_map<(i, j) -> (i floordiv 2, j floordiv 3, i mod 2, j mod 3)> }> // CHECK-LABEL: func private @sparse_bcsr( -// CHECK-SAME: tensor<10x60xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "dense", "dense" ], higherOrdering = affine_map<(d0, d1) -> (d0 floordiv 2, d1 floordiv 3, d0 mod 2, d1 mod 3)> }>> +// CHECK-SAME: tensor<10x60xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "dense", "dense" ], dimToLvl = affine_map<(d0, d1) -> (d0 floordiv 2, d1 floordiv 3, d0 mod 2, d1 mod 3)> }>> func.func private @sparse_bcsr(tensor<10x60xf64, #BCSR>) @@ -90,12 +89,11 @@ func.func private @sparse_bcsr(tensor<10x60xf64, #BCSR>) #ELL = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense", "compressed" ], - dimOrdering = affine_map<(ii, i, j) -> (ii, i, j)>, - higherOrdering = affine_map<(i,j)[c] -> (c*4*i, i, j)> + dimToLvl = affine_map<(i,j)[c] -> (c*4*i, i, j)> }> // CHECK-LABEL: func private @sparse_ell( -// CHECK-SAME: tensor (d0 * (s0 * 4), d0, d1)> }>> +// CHECK-SAME: tensor (d0 * (s0 * 4), d0, d1)> }>> func.func private @sparse_ell(tensor) // ----- diff --git a/mlir/test/Dialect/SparseTensor/sparse_concat.mlir b/mlir/test/Dialect/SparseTensor/sparse_concat.mlir index b9aa0b8836c3f..4aecea4e0c2b4 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_concat.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_concat.mlir @@ -4,12 +4,12 @@ #SparseMatrix_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #SparseMatrix_D_P = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // CHECK-LABEL: func.func @concat_mix_dense( diff --git a/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir b/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir index 3bb546c896ca4..af6780396c386 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_concat_codegen.mlir @@ -5,7 +5,7 @@ #DENSE = #sparse_tensor.encoding<{lvlTypes = ["dense", "dense"]}> #DENSE_P = #sparse_tensor.encoding<{ lvlTypes = ["dense", "dense"], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // CHECK-LABEL: @concat_sparse_sparse( // CHECK-SAME: %[[TMP_arg0:.*]]: tensor<2x4xf64, #sparse_tensor @@ -417,7 +417,7 @@ func.func @concat_sparse_sparse_annotated_dense(%arg0: tensor<2x4xf64, #DCSR>, // CHECK: } // CHECK: } // CHECK: %[[R:.*]] = sparse_tensor.convert %[[TMP_0]] -// CHECK: return %[[R]] : tensor (d1, d0)> }>> +// CHECK: return %[[R]] : tensor (d1, d0)> }>> func.func @concat_sparse_sparse_annotated_dense_permute(%arg0: tensor<2x4xf64, #DCSR>, %arg1: tensor<3x4xf64, #DCSR>, %arg2: tensor<4x4xf64, #DCSR>) diff --git a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir index 79306c13fb5b2..d5ba7ecca6c9a 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_expand.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_expand.mlir @@ -13,12 +13,12 @@ #CSC = #sparse_tensor.encoding<{ lvlTypes 
= [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #DCSC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #SV = #sparse_tensor.encoding<{ diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir index eec761829368b..ecca5fa363bb4 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir @@ -10,7 +10,7 @@ #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #trait_matvec = { @@ -24,15 +24,15 @@ } // CHECK-HIR-LABEL: func @matvec( -// CHECK-HIR-SAME: %[[VAL_0:.*]]: tensor<32x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>>, +// CHECK-HIR-SAME: %[[VAL_0:.*]]: tensor<32x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>>, // CHECK-HIR-SAME: %[[VAL_1:.*]]: tensor<64xf64>, // CHECK-HIR-SAME: %[[VAL_2:.*]]: tensor<32xf64>) -> tensor<32xf64> { // CHECK-HIR-DAG: %[[VAL_3:.*]] = arith.constant 64 : index // CHECK-HIR-DAG: %[[VAL_4:.*]] = arith.constant 0 : index // CHECK-HIR-DAG: %[[VAL_5:.*]] = arith.constant 1 : index -// CHECK-HIR-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>> to memref -// CHECK-HIR-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>> to memref -// CHECK-HIR-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>> to memref +// CHECK-HIR-DAG: %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> to memref +// CHECK-HIR-DAG: %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> to memref +// CHECK-HIR-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> to memref // CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64> // CHECK-HIR-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64> // CHECK-HIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] { diff --git a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir index dec9a13073a60..9db54f23f4bdc 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_matmul_codegen.mlir @@ -6,7 +6,7 @@ #CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (i,j)> + dimToLvl = affine_map<(i,j) -> 
(i,j)> }> // diff --git a/mlir/test/Dialect/SparseTensor/sparse_out.mlir b/mlir/test/Dialect/SparseTensor/sparse_out.mlir index ebeb75e3be6fb..496db65be6ec2 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_out.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_out.mlir @@ -2,12 +2,12 @@ #CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (i,j)> + dimToLvl = affine_map<(i,j) -> (i,j)> }> #DCSR = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (i,j)> + dimToLvl = affine_map<(i,j) -> (i,j)> }> #SparseTensor = #sparse_tensor.encoding<{ diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir index c53b32b43224f..438f2c496d891 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir @@ -3,7 +3,7 @@ #X = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense", "dense" ], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> #trait = { diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir index cdf87909bb383..2e3d723889cdd 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir @@ -5,7 +5,7 @@ #X = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense", "dense" ], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> #trait = { diff --git a/mlir/test/Dialect/SparseTensor/sparse_transpose.mlir b/mlir/test/Dialect/SparseTensor/sparse_transpose.mlir index 9bbcc7aba5d9b..71c4319aa797a 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_transpose.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_transpose.mlir @@ -20,12 +20,12 @@ // CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : index // CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : index // CHECK-DAG: %[[VAL_3:.*]] = bufferization.alloc_tensor() : tensor<4x3xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> -// CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.convert %[[VAL_0]] : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>> -// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_4]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>> to memref -// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_4]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>> to memref -// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_4]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>> to memref -// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_4]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>> to memref -// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_4]] : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimOrdering = 
affine_map<(d0, d1) -> (d1, d0)> }>> to memref +// CHECK-DAG: %[[VAL_4:.*]] = sparse_tensor.convert %[[VAL_0]] : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> to tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> +// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_4]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> to memref +// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_4]] {level = 0 : index} : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> to memref +// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_4]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> to memref +// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_4]] {level = 1 : index} : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> to memref +// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_4]] : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> to memref // CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_1]]] : memref // CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_2]]] : memref // CHECK: %[[VAL_12:.*]] = scf.for %[[VAL_13:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_14:.*]] = %[[VAL_3]]) -> (tensor<4x3xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>>) { @@ -42,7 +42,7 @@ // CHECK: scf.yield %[[VAL_25:.*]] : tensor<4x3xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> // CHECK: } // CHECK: %[[VAL_26:.*]] = sparse_tensor.load %[[VAL_27:.*]] hasInserts : tensor<4x3xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> -// CHECK: bufferization.dealloc_tensor %[[VAL_4]] : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)> }>> +// CHECK: bufferization.dealloc_tensor %[[VAL_4]] : tensor<3x4xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)> }>> // CHECK: return %[[VAL_26]] : tensor<4x3xf64, #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ] }>> // CHECK: } func.func @sparse_transpose_auto(%arga: tensor<3x4xf64, #DCSR>) diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_concat.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_concat.mlir index bffdf4dcc9a41..b8521f21836f8 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_vector_concat.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_vector_concat.mlir @@ -6,12 +6,12 @@ #MAT_C_C_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_C_D_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0.mlir index 
746881ed65276..0a0f671b96e9e 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0.mlir @@ -31,22 +31,22 @@ #MAT_C_D = #sparse_tensor.encoding<{lvlTypes = ["compressed", "dense"]}> #MAT_D_D = #sparse_tensor.encoding<{ lvlTypes = ["dense", "dense"], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_C_C_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_C_D_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_D_C_P = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> module { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir index e9c430a104598..c870e91f2c059 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir @@ -31,22 +31,22 @@ #MAT_C_D = #sparse_tensor.encoding<{lvlTypes = ["compressed", "dense"]}> #MAT_D_D = #sparse_tensor.encoding<{ lvlTypes = ["dense", "dense"], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_C_C_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_C_D_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_D_C_P = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> module { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir index d5e47b985bcd6..70c3322801057 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir @@ -21,22 +21,22 @@ #MAT_C_D = #sparse_tensor.encoding<{lvlTypes = ["compressed", "dense"]}> #MAT_D_D = #sparse_tensor.encoding<{ lvlTypes = ["dense", "dense"], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_C_C_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_C_D_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_D_C_P = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> module { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir index 760390eeb1f52..0bab30d08e4c7 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir @@ 
-31,22 +31,22 @@ #MAT_C_D = #sparse_tensor.encoding<{lvlTypes = ["compressed", "dense"]}> #MAT_D_D = #sparse_tensor.encoding<{ lvlTypes = ["dense", "dense"], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_C_C_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_C_D_P = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #MAT_D_C_P = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> module { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir index 32efaf7857c6e..3384fda86c829 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dense_output.mlir @@ -32,12 +32,12 @@ #DenseMatrix = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense" ], - dimOrdering = affine_map<(i,j) -> (i,j)> + dimToLvl = affine_map<(i,j) -> (i,j)> }> #SparseMatrix = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (i,j)> + dimToLvl = affine_map<(i,j) -> (i,j)> }> #trait_assign = { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir index c3fb424d23619..45bc8899da043 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir @@ -36,7 +36,7 @@ #DCSC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #SortedCOO = #sparse_tensor.encoding<{ @@ -45,12 +45,12 @@ #SortedCOOPerm = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #CCCPerm = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed"], - dimOrdering = affine_map<(d0, d1, d2) -> (d1, d2, d0)> + dimToLvl = affine_map<(d0, d1, d2) -> (d1, d2, d0)> }> module { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir index 58ef79dbb77ff..2c7284f7746c7 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir @@ -32,7 +32,7 @@ #CDC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense", "compressed" ] // FIXME: Still inadmissible might need investigation - // dimOrdering = affine_map<(i,j,k) -> (j,k,i)> + // dimToLvl = affine_map<(i,j,k) -> (j,k,i)> }> // Creates and returns 3-D buffer of size (%s1, %s2, %s3) filled with the value %f diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir index 21d6792b04a64..416c4a76bedd8 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir @@ -31,7 +31,7 @@ #CDR = #sparse_tensor.encoding<{lvlTypes = 
["compressed", "dense"]}> #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // An example of a 2D convolution with a sparse filter. diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir index 8d62efecb016e..1a14a3d70ef36 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir @@ -28,17 +28,17 @@ #Tensor1 = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (i,j,k)> + dimToLvl = affine_map<(i,j,k) -> (i,j,k)> }> #Tensor2 = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (j,k,i)> + dimToLvl = affine_map<(i,j,k) -> (j,k,i)> }> #Tensor3 = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> // diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_dyn.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_dyn.mlir index e2fcdc2b56add..a394089f1bccc 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_dyn.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_dyn.mlir @@ -32,7 +32,7 @@ #DCSC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_element.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_element.mlir index e3fbc71880d08..d026ca15b2462 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_element.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_element.mlir @@ -32,7 +32,7 @@ #Tensor3 = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (i,k,j)> + dimToLvl = affine_map<(i,j,k) -> (i,k,j)> }> module { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_ptr.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_ptr.mlir index bff64ea0b71df..7a4b4d7e48f5a 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_ptr.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_ptr.mlir @@ -34,14 +34,14 @@ #DCSC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)>, + dimToLvl = affine_map<(i,j) -> (j,i)>, posWidth = 64, crdWidth = 64 }> #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)>, + dimToLvl = affine_map<(i,j) -> (j,i)>, posWidth = 16, crdWidth = 32 }> diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir index d2658afe08503..758de13c3128a 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2dense.mlir @@ -28,32 +28,32 @@ #Tensor1 = #sparse_tensor.encoding<{ lvlTypes = [ 
"compressed", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (i,j,k)> + dimToLvl = affine_map<(i,j,k) -> (i,j,k)> }> #Tensor2 = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (j,k,i)> + dimToLvl = affine_map<(i,j,k) -> (j,k,i)> }> #Tensor3 = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> #Tensor4 = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (i,j,k)> + dimToLvl = affine_map<(i,j,k) -> (i,j,k)> }> #Tensor5 = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (j,k,i)> + dimToLvl = affine_map<(i,j,k) -> (j,k,i)> }> #Tensor6 = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> // diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir index 318d24fc6727a..9bee64c4c775b 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir @@ -39,7 +39,7 @@ #Tensor3 = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "dense", "compressed" ], - dimOrdering = affine_map<(i,j,k) -> (i,k,j)> + dimToLvl = affine_map<(i,j,k) -> (i,k,j)> }> #SingletonTensor1 = #sparse_tensor.encoding<{ diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir index a00f3d7a3011f..2b30551dcbcd0 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_expand.mlir @@ -27,7 +27,7 @@ #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> module { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_flatten.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_flatten.mlir index 7109ef481c5db..796da443be0b1 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_flatten.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_flatten.mlir @@ -33,10 +33,10 @@ #SparseTensor = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed", "compressed", "compressed", "compressed", "compressed", "compressed" ], - // Note that any dimOrdering permutation should give the same results + // Note that any dimToLvl permutation should give the same results // since, even though it impacts the sparse storage scheme layout, // it should not change the semantics. 
- dimOrdering = affine_map<(i,j,k,l,m,n,o,p) -> (p,o,j,k,i,l,m,n)> + dimToLvl = affine_map<(i,j,k,l,m,n,o,p) -> (p,o,j,k,i,l,m,n)> }> #trait_flatten = { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir index fbcb6ecf8a4cc..8dd6fb94d812e 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir @@ -38,12 +38,12 @@ #CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (i,j)> + dimToLvl = affine_map<(i,j) -> (i,j)> }> #DCSR = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (i,j)> + dimToLvl = affine_map<(i,j) -> (i,j)> }> module { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir index 5b3493a7aa9ad..c476aa3d9154f 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_out_simple.mlir @@ -32,7 +32,7 @@ #DCSR = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (i,j)> + dimToLvl = affine_map<(i,j) -> (i,j)> }> #eltwise_mult = { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom.mlir index babb0cbfdd400..2a01cc632d573 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom.mlir @@ -32,7 +32,7 @@ #CSR = #sparse_tensor.encoding<{lvlTypes = ["dense", "compressed"]}> #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom_prod.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom_prod.mlir index b31793dd62bb8..06d83306f5690 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom_prod.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_reduce_custom_prod.mlir @@ -23,7 +23,7 @@ #CSR = #sparse_tensor.encoding<{lvlTypes = ["dense", "compressed"]}> #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_select.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_select.mlir index 8aeb6a3a5bf56..bcf2dc11e5477 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_select.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_select.mlir @@ -26,7 +26,7 @@ #CSR = #sparse_tensor.encoding<{lvlTypes = ["dense", "compressed"]}> #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir index f4e46944c79d3..a94491a046fcf 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_sorted_coo.mlir @@ 
-34,7 +34,7 @@ #SortedCOOPermuted = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #SortedCOO3D = #sparse_tensor.encoding<{ @@ -43,7 +43,7 @@ #SortedCOO3DPermuted = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton-nu", "singleton" ], - dimOrdering = affine_map<(i,j,k) -> (k,i,j)> + dimToLvl = affine_map<(i,j,k) -> (k,i,j)> }> #trait_scale = { diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_storage.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_storage.mlir index 2ec6e00447e8d..774ffc731aff6 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_storage.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_storage.mlir @@ -44,12 +44,12 @@ #CSC = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #DCSC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #BlockRow = #sparse_tensor.encoding<{ @@ -58,7 +58,7 @@ #BlockCol = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "dense" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> // diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir index 4481ff570c6e2..7f14877465963 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir @@ -32,7 +32,7 @@ #DCSC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - dimOrdering = affine_map<(i,j) -> (j,i)> + dimToLvl = affine_map<(i,j) -> (j,i)> }> #transpose_trait = { diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_SDDMM.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_SDDMM.py index 1f9b636038318..0cdc7c88bd97f 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_SDDMM.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_SDDMM.py @@ -155,7 +155,7 @@ def main(): for iwidth in [32]: for e in [True]: attr = st.EncodingAttr.get( - level, ordering, None, pwidth, iwidth + level, ordering, pwidth, iwidth ) opt = f"parallelization-strategy=none" compiler = sparse_compiler.SparseCompiler( diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py index 69f6cdcea967f..01d74a4dc82fa 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_SpMM.py @@ -145,7 +145,7 @@ def main(): for pwidth in bitwidths: for iwidth in bitwidths: attr = st.EncodingAttr.get( - level, ordering, None, pwidth, iwidth + level, ordering, pwidth, iwidth ) build_compile_and_run_SpMM(attr, compiler) count = count + 1 diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py index 7d77490080205..8f3f4e5af1e58 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_output.py @@ -91,7 +91,7 @@ def main(): for level in levels: for ordering in orderings: for bwidth in bitwidths: - attr = st.EncodingAttr.get(level, ordering, None, 
bwidth, bwidth) + attr = st.EncodingAttr.get(level, ordering, bwidth, bwidth) build_compile_and_run_output(attr, compiler) count = count + 1 diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py index 373f7457e0b5f..7425a229106ba 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py @@ -233,7 +233,7 @@ def main(): for pwidth in bitwidths: for iwidth in bitwidths: attr = st.EncodingAttr.get( - level, ordering, None, pwidth, iwidth + level, ordering, pwidth, iwidth ) types.append(ir.RankedTensorType.get(shape, f64, attr)) # diff --git a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py index b3194f7edecd5..c8cb77086ea34 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py +++ b/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco.py @@ -387,7 +387,6 @@ def mlir_tensor_attr(self) -> Optional[sparse_tensor.EncodingAttr]: return sparse_tensor.EncodingAttr.get( mlir_storage_format, ir.AffineMap.get_permutation(order), - None, _POS_WIDTH, _CRD_WIDTH, ) diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py index b7a06067b5f56..8d98d670ee4d0 100644 --- a/mlir/test/python/dialects/sparse_tensor/dialect.py +++ b/mlir/test/python/dialects/sparse_tensor/dialect.py @@ -30,14 +30,14 @@ def testEncodingAttr1D(): # CHECK: lvl_types: [] print(f"lvl_types: {casted.lvl_types}") - # CHECK: dim_ordering: None - print(f"dim_ordering: {casted.dim_ordering}") + # CHECK: dim_to_lvl: None + print(f"dim_to_lvl: {casted.dim_to_lvl}") # CHECK: pos_width: 16 print(f"pos_width: {casted.pos_width}") # CHECK: crd_width: 32 print(f"crd_width: {casted.crd_width}") - created = st.EncodingAttr.get(casted.lvl_types, None, None, 0, 0) + created = st.EncodingAttr.get(casted.lvl_types, None, 0, 0) # CHECK: #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }> print(created) # CHECK: created_equal: False @@ -57,12 +57,12 @@ def testEncodingAttr2D(): parsed = Attribute.parse( "#sparse_tensor.encoding<{" ' lvlTypes = [ "dense", "compressed" ],' - " dimOrdering = affine_map<(d0, d1) -> (d1, d0)>," + " dimToLvl = affine_map<(d0, d1) -> (d1, d0)>," " posWidth = 8," " crdWidth = 32" "}>" ) - # CHECK: #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, posWidth = 8, crdWidth = 32 }> + # CHECK: #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)>, posWidth = 8, crdWidth = 32 }> print(parsed) casted = st.EncodingAttr(parsed) @@ -71,17 +71,17 @@ def testEncodingAttr2D(): # CHECK: lvl_types: [, ] print(f"lvl_types: {casted.lvl_types}") - # CHECK: dim_ordering: (d0, d1) -> (d1, d0) - print(f"dim_ordering: {casted.dim_ordering}") + # CHECK: dim_to_lvl: (d0, d1) -> (d1, d0) + print(f"dim_to_lvl: {casted.dim_to_lvl}") # CHECK: pos_width: 8 print(f"pos_width: {casted.pos_width}") # CHECK: crd_width: 32 print(f"crd_width: {casted.crd_width}") created = st.EncodingAttr.get( - casted.lvl_types, casted.dim_ordering, casted.higher_ordering, 8, 32 + casted.lvl_types, casted.dim_to_lvl, 8, 32 ) - # CHECK: #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, posWidth = 8, crdWidth = 32 }> + # CHECK: 
#sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], dimToLvl = affine_map<(d0, d1) -> (d1, d0)>, posWidth = 8, crdWidth = 32 }> print(created) # CHECK: created_equal: True print(f"created_equal: {created == casted}") From c03e6511cf5846a1b619ae96bfb1ef9b3da733b7 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Tue, 30 May 2023 14:47:14 -0700 Subject: [PATCH 161/704] [BOLT] Add skip-non-simple for boltdiff Extra filtering for boltdiff, excluding non-simple functions from comparison. Reviewed By: #bolt, maksfb Differential Revision: https://reviews.llvm.org/D151510 --- bolt/lib/Rewrite/BoltDiff.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Rewrite/BoltDiff.cpp b/bolt/lib/Rewrite/BoltDiff.cpp index 25b2fad25b3e7..69ba63a9e83a4 100644 --- a/bolt/lib/Rewrite/BoltDiff.cpp +++ b/bolt/lib/Rewrite/BoltDiff.cpp @@ -83,6 +83,11 @@ static cl::opt NormalizeByBin1( "collection time and sampling rate for this to make sense"), cl::cat(BoltDiffCategory)); +static cl::opt + SkipNonSimple("skip-non-simple", + cl::desc("skip non-simple functions in reporting"), + cl::ReallyHidden, cl::cat(BoltDiffCategory)); + } // end namespace opts namespace llvm { @@ -428,8 +433,10 @@ class RewriteInstanceDiff { llvm::make_second_range(llvm::reverse(LargestDiffs))) { const double Score2 = getNormalizedScore(*BB2, RI2); const double Score1 = getNormalizedScore(*BBMap[BB2], RI1); - outs() << "BB " << BB2->getName() << " from " - << BBToFuncMap[BB2]->getDemangledName() + const BinaryFunction *Func = BBToFuncMap[BB2]; + if (opts::SkipNonSimple && !Func->isSimple()) + continue; + outs() << "BB " << BB2->getName() << " from " << Func->getDemangledName() << "\n\tScore bin1 = " << format("%.4f", Score1 * 100.0) << "%\n\tScore bin2 = " << format("%.4f", Score2 * 100.0); outs() << "%\t(Difference: "; @@ -460,9 +467,12 @@ class RewriteInstanceDiff { EdgeTy &Edge1 = EI.second; const double Score2 = std::get<2>(Edge2); const double Score1 = std::get<2>(Edge1); + const BinaryFunction *Func = BBToFuncMap[std::get<0>(Edge2)]; + if (opts::SkipNonSimple && !Func->isSimple()) + continue; outs() << "Edge (" << std::get<0>(Edge2)->getName() << " -> " << std::get<1>(Edge2)->getName() << ") in " - << BBToFuncMap[std::get<0>(Edge2)]->getDemangledName() + << Func->getDemangledName() << "\n\tScore bin1 = " << format("%.4f", Score1 * 100.0) << "%\n\tScore bin2 = " << format("%.4f", Score2 * 100.0); outs() << "%\t(Difference: "; @@ -537,6 +547,8 @@ class RewriteInstanceDiff { Score2 = LTOAggregatedScore2[Iter2->second]; if (Score1 == 0.0 || Score2 == 0.0) continue; + if (opts::SkipNonSimple && !Func1->isSimple() && !Func2->isSimple()) + continue; LargestDiffs.insert( std::make_pair<>(std::abs(Score1 - Score2), MapEntry)); ScoreMap[Func2] = std::make_pair<>(Score1, Score2); From 135ce2f820d881d5a7c5d90feab109174918a21f Mon Sep 17 00:00:00 2001 From: ruturaj4 Date: Sun, 21 May 2023 12:34:56 -0500 Subject: [PATCH 162/704] [clang][ExtractAPI] Modify declaration fragment methods to add a new fragment at an arbitrary offset. The current implementation doesn't support merging declaration fragments at arbitrary offsets. This patch adds that support by modifying declaration fragment methods. 
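A rough usage sketch of the insertion API this adds (placeholder names `Record` and `Name`; not code lifted verbatim from the patch): positive indices count from the front of the fragment list, negative indices count back from its end.

```cpp
// Hypothetical caller, mirroring the ExtractAPIVisitor.h hunk below:
// rewrite a tag declaration's fragments into typedef form by inserting
// at the front (indices 0 and 1) and just before the trailing fragment
// (index -1). Out-of-range positive indices are clamped to the end.
DeclarationFragments &Frags = Record->Declaration;
Frags.insertAtIndex(0, "typedef", DeclarationFragments::FragmentKind::Keyword)
    .insertAtIndex(1, " ", DeclarationFragments::FragmentKind::Text)
    .insertAtIndex(-1, " { ... } ", DeclarationFragments::FragmentKind::Text)
    .insertAtIndex(-1, Name, DeclarationFragments::FragmentKind::Identifier);
```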
Differential Revision: https://reviews.llvm.org/D151048 --- .../clang/ExtractAPI/DeclarationFragments.h | 32 +++++++++++++------ .../clang/ExtractAPI/ExtractAPIVisitor.h | 15 +++++---- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/clang/include/clang/ExtractAPI/DeclarationFragments.h b/clang/include/clang/ExtractAPI/DeclarationFragments.h index 90121a138175c..0eb240d2b5930 100644 --- a/clang/include/clang/ExtractAPI/DeclarationFragments.h +++ b/clang/include/clang/ExtractAPI/DeclarationFragments.h @@ -99,25 +99,37 @@ class DeclarationFragments { const std::vector &getFragments() const { return Fragments; } - // Add a new Fragment to the beginning of the Fragments. - DeclarationFragments &appendFront(StringRef Spelling, FragmentKind Kind, - StringRef PreciseIdentifier = "", - const Decl *Declaration = nullptr) { - Fragments.emplace(Fragments.begin(), Spelling, Kind, PreciseIdentifier, - Declaration); + size_t calculateOffset(intmax_t Index) const { + if (Index >= 0) { + size_t offset = static_cast(Index); + if (offset > Fragments.size()) { + offset = Fragments.size(); + } + return offset; + } + return Fragments.size() + static_cast(Index); + } + + // Add a new Fragment at an arbitrary offset. + DeclarationFragments &insertAtIndex(intmax_t Index, StringRef Spelling, + FragmentKind Kind, + StringRef PreciseIdentifier = "", + const Decl *Declaration = nullptr) { + Fragments.insert( + Fragments.begin() + calculateOffset(Index), + std::move(Fragment(Spelling, Kind, PreciseIdentifier, Declaration))); return *this; } - DeclarationFragments &appendFront(DeclarationFragments &&Other) { - Fragments.insert(Fragments.begin(), + DeclarationFragments &insertAtIndex(intmax_t Index, + DeclarationFragments &&Other) { + Fragments.insert(Fragments.begin() + calculateOffset(Index), std::make_move_iterator(Other.Fragments.begin()), std::make_move_iterator(Other.Fragments.end())); Other.Fragments.clear(); return *this; } - void removeLast() { Fragments.pop_back(); } - /// Append a new Fragment to the end of the Fragments. /// /// \returns a reference to the DeclarationFragments object itself after diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h index 8b3721a4d7adb..1b82f2604403d 100644 --- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h +++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h @@ -110,15 +110,16 @@ template static void modifyRecords(const T &Records, const StringRef &Name) { for (const auto &Record : Records) { if (Name == Record.second.get()->Name) { - Record.second.get()->Declaration.removeLast(); Record.second.get() ->Declaration - .appendFront(" ", DeclarationFragments::FragmentKind::Text) - .appendFront("typedef", DeclarationFragments::FragmentKind::Keyword, - "", nullptr) - .append(" { ... } ", DeclarationFragments::FragmentKind::Text) - .append(Name, DeclarationFragments::FragmentKind::Identifier) - .append(";", DeclarationFragments::FragmentKind::Text); + .insertAtIndex(0, "typedef", + DeclarationFragments::FragmentKind::Keyword, "", + nullptr) + .insertAtIndex(1, " ", DeclarationFragments::FragmentKind::Text) + .insertAtIndex(-1, " { ... 
} ", + DeclarationFragments::FragmentKind::Text) + .insertAtIndex(-1, Name, + DeclarationFragments::FragmentKind::Identifier); break; } } From 92180dae775f66193069279ee7532735ea2fee06 Mon Sep 17 00:00:00 2001 From: ruturaj4 Date: Sat, 27 May 2023 18:54:56 -0500 Subject: [PATCH 163/704] [clang][ExtractAPI] Modify declaration fragment methods to add a new fragment at an arbitrary offset. --- .../clang/ExtractAPI/DeclarationFragments.h | 40 +++++++++---------- .../clang/ExtractAPI/ExtractAPIVisitor.h | 20 +++++----- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/clang/include/clang/ExtractAPI/DeclarationFragments.h b/clang/include/clang/ExtractAPI/DeclarationFragments.h index 0eb240d2b5930..4c1b830807047 100644 --- a/clang/include/clang/ExtractAPI/DeclarationFragments.h +++ b/clang/include/clang/ExtractAPI/DeclarationFragments.h @@ -97,34 +97,32 @@ class DeclarationFragments { Declaration(Declaration) {} }; + using FragmentIterator = std::vector::iterator; + using ConstFragmentIterator = std::vector::const_iterator; + const std::vector &getFragments() const { return Fragments; } - size_t calculateOffset(intmax_t Index) const { - if (Index >= 0) { - size_t offset = static_cast(Index); - if (offset > Fragments.size()) { - offset = Fragments.size(); - } - return offset; - } - return Fragments.size() + static_cast(Index); - } + FragmentIterator begin() { return Fragments.begin(); } + + FragmentIterator end() { return Fragments.end(); } + + ConstFragmentIterator cbegin() const { return Fragments.cbegin(); } + + ConstFragmentIterator cend() const { return Fragments.cend(); } // Add a new Fragment at an arbitrary offset. - DeclarationFragments &insertAtIndex(intmax_t Index, StringRef Spelling, - FragmentKind Kind, - StringRef PreciseIdentifier = "", - const Decl *Declaration = nullptr) { - Fragments.insert( - Fragments.begin() + calculateOffset(Index), - std::move(Fragment(Spelling, Kind, PreciseIdentifier, Declaration))); + DeclarationFragments &insert(FragmentIterator It, StringRef Spelling, + FragmentKind Kind, + StringRef PreciseIdentifier = "", + const Decl *Declaration = nullptr) { + Fragments.insert(It, std::move(Fragment(Spelling, Kind, PreciseIdentifier, + Declaration))); return *this; } - DeclarationFragments &insertAtIndex(intmax_t Index, - DeclarationFragments &&Other) { - Fragments.insert(Fragments.begin() + calculateOffset(Index), - std::make_move_iterator(Other.Fragments.begin()), + DeclarationFragments &insert(FragmentIterator It, + DeclarationFragments &&Other) { + Fragments.insert(It, std::make_move_iterator(Other.Fragments.begin()), std::make_move_iterator(Other.Fragments.end())); Other.Fragments.clear(); return *this; diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h index 1b82f2604403d..f0882afb5a61b 100644 --- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h +++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h @@ -110,16 +110,16 @@ template static void modifyRecords(const T &Records, const StringRef &Name) { for (const auto &Record : Records) { if (Name == Record.second.get()->Name) { - Record.second.get() - ->Declaration - .insertAtIndex(0, "typedef", - DeclarationFragments::FragmentKind::Keyword, "", - nullptr) - .insertAtIndex(1, " ", DeclarationFragments::FragmentKind::Text) - .insertAtIndex(-1, " { ... 
} ", - DeclarationFragments::FragmentKind::Text) - .insertAtIndex(-1, Name, - DeclarationFragments::FragmentKind::Identifier); + auto &DeclFragment = Record.second->Declaration; + DeclFragment.insert(DeclFragment.begin(), " ", + DeclarationFragments::FragmentKind::Text); + DeclFragment.insert(DeclFragment.begin(), "typedef", + DeclarationFragments::FragmentKind::Keyword, "", + nullptr); + DeclFragment.insert(--DeclFragment.end(), " { ... } ", + DeclarationFragments::FragmentKind::Text); + DeclFragment.insert(--DeclFragment.end(), Name, + DeclarationFragments::FragmentKind::Identifier); break; } } From 397f2e9ebee8d8e45547e90dd05228d7f965df67 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 30 May 2023 15:32:43 -0700 Subject: [PATCH 164/704] Remove llvm::Optional This is part of an effort to migrate from llvm::Optional to std::optional: https://discourse.llvm.org/t/deprecating-llvm-optional-x-hasvalue-getvalue-getvalueor/63716 Differential Revision: https://reviews.llvm.org/D149128 --- llvm/include/llvm/ADT/None.h | 31 ------------------------------- llvm/include/llvm/ADT/Optional.h | 27 --------------------------- mlir/include/mlir/Support/LLVM.h | 5 +---- 3 files changed, 1 insertion(+), 62 deletions(-) delete mode 100644 llvm/include/llvm/ADT/None.h delete mode 100644 llvm/include/llvm/ADT/Optional.h diff --git a/llvm/include/llvm/ADT/None.h b/llvm/include/llvm/ADT/None.h deleted file mode 100644 index c497821a696eb..0000000000000 --- a/llvm/include/llvm/ADT/None.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- None.h - Simple null value for implicit construction ------*- C++ -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file provides None, an enumerator for use in implicit constructors -/// of various (usually templated) types to make such construction more -/// terse. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ADT_NONE_H -#define LLVM_ADT_NONE_H - -#include "llvm/Support/Compiler.h" -#include - -namespace llvm { -/// A simple null object to allow implicit construction of std::optional -/// and similar types without having to spell out the specialization's name. -LLVM_DEPRECATED("Use std::nullopt_t instead", "std::nullopt_t") -typedef std::nullopt_t NoneType; -LLVM_DEPRECATED("Use std::nullopt instead.", "std::nullopt") -inline constexpr std::nullopt_t None = std::nullopt; -} - -#endif diff --git a/llvm/include/llvm/ADT/Optional.h b/llvm/include/llvm/ADT/Optional.h deleted file mode 100644 index c3382837c0aea..0000000000000 --- a/llvm/include/llvm/ADT/Optional.h +++ /dev/null @@ -1,27 +0,0 @@ -//===- Optional.h - Simple variant for passing optional values --*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file provides Optional, a template class modeled in the spirit of -/// OCaml's 'opt' variant. The idea is to strongly type whether or not -/// a value can be optional. 
-/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ADT_OPTIONAL_H -#define LLVM_ADT_OPTIONAL_H - -#include - -namespace llvm { -// Legacy alias of llvm::Optional to std::optional. -// FIXME: Remove this after LLVM 16. -template using Optional = std::optional; -} // namespace llvm - -#endif // LLVM_ADT_OPTIONAL_H diff --git a/mlir/include/mlir/Support/LLVM.h b/mlir/include/mlir/Support/LLVM.h index 216de42ad810e..e55ae06a8721e 100644 --- a/mlir/include/mlir/Support/LLVM.h +++ b/mlir/include/mlir/Support/LLVM.h @@ -18,9 +18,8 @@ #ifndef MLIR_SUPPORT_LLVM_H #define MLIR_SUPPORT_LLVM_H -// We include these two headers because they cannot be practically forward +// We include this header because it cannot be practically forward // declared, and are effectively language features. -#include "llvm/ADT/None.h" #include "llvm/Support/Casting.h" #include @@ -58,7 +57,6 @@ class DenseSet; class MallocAllocator; template class MutableArrayRef; -template using Optional = std::optional; template class PointerUnion; template @@ -130,7 +128,6 @@ using SetVector = llvm::SetVector; template using StringSet = llvm::StringSet; using llvm::MutableArrayRef; -using llvm::Optional; using llvm::PointerUnion; using llvm::SmallPtrSet; using llvm::SmallPtrSetImpl; From c7eb1b07470b9babfcd258f014df3661e5f84b30 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Tue, 30 May 2023 10:31:44 -0700 Subject: [PATCH 165/704] [lldb] Consult summary provider before printing children of root references When printing the root of a value, if it's a reference its children are unconditionally printed - in contrast to pointers whose children are only printed if a sufficient pointer depth is given. However, the children are printed even when there's a summary provider that says not to. If a summary provider exists, this change consults it to determine if children should be printed. 
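A condensed sketch of the new check (simplified from the ShouldPrintChildren change below; not verbatim):

```cpp
// A registered summary formatter can now veto expansion of a root
// reference's children via DoesPrintChildren().
bool print_children = true;
if (TypeSummaryImpl *type_summary = GetSummaryFormatter())
  print_children = type_summary->DoesPrintChildren(m_valobj);
if (is_ref && is_root_level && print_children)
  return curr_ptr_depth.CanAllowExpansion();
```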
For example, given a variable of type `std::string &`, this change has the following effect: Before: ``` (lldb) p string_ref (std::string &) string_ref = "one two three four five six seven eight nine ten": { __r_ = { std::__1::__compressed_pair_elem, std::__1::allocator >::__rep, 0, false> = { __value_ = { = { __l = (__data_ = "one two three four five six seven eight nine ten", __size_ = 48, __cap_ = 64, __is_long_ = 1) __s = (__data_ = "@\0p\U00000001\0`\0\00\0\0\0\0\0\0\0@", __padding_ = "\x80t<", __size_ = '\0', __is_long_ = '\x01') __r = { __words ={...} } } } } } } ``` After: ``` (lldb) p string_ref (std::string &) string_ref = "one two three four five six seven eight nine ten" ``` rdar://73248786 Differential Revision: https://reviews.llvm.org/D151748 --- .../DataFormatters/ValueObjectPrinter.cpp | 10 ++++--- .../root-reference-children/Makefile | 3 +++ .../TestRootReferenceChildren.py | 27 +++++++++++++++++++ .../root-reference-children/main.cpp | 24 +++++++++++++++++ 4 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 lldb/test/API/functionalities/data-formatter/root-reference-children/Makefile create mode 100644 lldb/test/API/functionalities/data-formatter/root-reference-children/TestRootReferenceChildren.py create mode 100644 lldb/test/API/functionalities/data-formatter/root-reference-children/main.cpp diff --git a/lldb/source/DataFormatters/ValueObjectPrinter.cpp b/lldb/source/DataFormatters/ValueObjectPrinter.cpp index bde999a7a8bcf..fac319f67c805 100644 --- a/lldb/source/DataFormatters/ValueObjectPrinter.cpp +++ b/lldb/source/DataFormatters/ValueObjectPrinter.cpp @@ -516,11 +516,13 @@ bool ValueObjectPrinter::ShouldPrintChildren( if (m_options.m_pointer_as_array) return true; - TypeSummaryImpl *entry = GetSummaryFormatter(); - if (m_options.m_use_objc) return false; + bool print_children = true; + if (TypeSummaryImpl *type_summary = GetSummaryFormatter()) + print_children = type_summary->DoesPrintChildren(m_valobj); + if (is_failed_description || !HasReachedMaximumDepth()) { // We will show children for all concrete types. We won't show pointer // contents unless a pointer depth has been specified. We won't reference @@ -538,7 +540,7 @@ bool ValueObjectPrinter::ShouldPrintChildren( const bool is_root_level = m_curr_depth == 0; - if (is_ref && is_root_level) { + if (is_ref && is_root_level && print_children) { // If this is the root object (depth is zero) that we are showing and // it is a reference, and no pointer depth has been supplied print out // what it references. 
Don't do this at deeper depths otherwise we can @@ -549,7 +551,7 @@ bool ValueObjectPrinter::ShouldPrintChildren( return curr_ptr_depth.CanAllowExpansion(); } - return (!entry || entry->DoesPrintChildren(m_valobj) || m_summary.empty()); + return print_children || m_summary.empty(); } return false; } diff --git a/lldb/test/API/functionalities/data-formatter/root-reference-children/Makefile b/lldb/test/API/functionalities/data-formatter/root-reference-children/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/root-reference-children/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/root-reference-children/TestRootReferenceChildren.py b/lldb/test/API/functionalities/data-formatter/root-reference-children/TestRootReferenceChildren.py new file mode 100644 index 0000000000000..5de66177e7cad --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/root-reference-children/TestRootReferenceChildren.py @@ -0,0 +1,27 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestCase(TestBase): + def test(self): + self.build() + lldbutil.run_to_source_breakpoint( + self, "break here", lldb.SBFileSpec("main.cpp") + ) + + self.dbg.HandleCommand( + f"type summary add --expand -s 'some summary' SummaryAndChildren" + ) + self.dbg.HandleCommand(f"type summary add -s 'some summary' SummaryOnly") + + self.expect( + "v summary_and_children_ref", substrs=["some summary", "child = 30"] + ) + self.expect( + "v summary_only_ref", patterns=["some summary", "(?s)^(?!.*child = )"] + ) + self.expect( + "v children_only_ref", patterns=["(?s)^(?!.*some summary)", "child = 30"] + ) diff --git a/lldb/test/API/functionalities/data-formatter/root-reference-children/main.cpp b/lldb/test/API/functionalities/data-formatter/root-reference-children/main.cpp new file mode 100644 index 0000000000000..da4d0d2128a46 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/root-reference-children/main.cpp @@ -0,0 +1,24 @@ +#include + +struct SummaryAndChildren { + int child = 30; +}; + +struct SummaryOnly { + int child = 30; +}; + +struct ChildrenOnly { + int child = 30; +}; + +int main() { + SummaryAndChildren summary_and_children; + SummaryOnly summary_only; + ChildrenOnly children_only; + auto &summary_and_children_ref = summary_and_children; + auto &summary_only_ref = summary_only; + auto &children_only_ref = children_only; + printf("break here\n"); + return 0; +} From 48a12ae8212c22d9d1d84270db659ac76ecfa972 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Tue, 30 May 2023 15:13:36 -0700 Subject: [PATCH 166/704] Fix a few bugs with Mach-O corefile loading, plus perf In ProcessMachCore::LoadBinariesViaMetadata(), if we did load some binaries via metadata in the core file, don't then search for a userland dyld in the corefile / kernel and throw away that binary list. Also fix a little bug with correctly recognizing corefiles using a `main bin spec` LC_NOTE that explicitly declare that this is a userland corefile. LocateSymbolFileMacOSX.cpp's Symbols::DownloadObjectAndSymbolFile clarify the comments on how the force_lookup and how the dbgshell_command local both have the same effect. In PlatformDarwinKernel::LoadPlatformBinaryAndSetup, don't log a message unless we actually found a kernel fileset. 
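A condensed sketch of that call pattern (abridged from the ObjectFileMachO::LoadCoreFileImages change below; not verbatim):

```cpp
// Only let the helper set the load address in the Target when the corefile
// did not supply per-segment vmaddrs; otherwise the segments are mapped
// individually through the Target's section load list afterwards.
const bool set_load_address = image.segment_load_addresses.empty();
ModuleSP module_sp = DynamicLoader::LoadBinaryWithUUIDAndAddress(
    &process, image.filename, image.uuid, value, value_is_offset,
    image.currently_executing, /*notify=*/false,
    /*set_address_in_target=*/set_load_address);
```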
Reorganize ObjectFileMachO::LoadCoreFileImages so that it delegates binary searching to DynamicLoader::LoadBinaryWithUUIDAndAddress and doesn't duplicate those searches. For searches that fail, we would perform them multiple times in both methods. When we have the mach-o segment vmaddrs for a binary, don't let LoadBinaryWithUUIDAndAddress load the binary first at its mach-o header address in the Target; we'll load the segments at the correct addresses individually later in this method. DynamicLoaderDarwin::ImageInfo::PutToLog fix a LLDB_LOG logging formatter. In DynamicLoader::LoadBinaryWithUUIDAndAddress, instead of using Target::GetOrCreateModule as a way to find a binary already registered in lldb's global module cache (and implicitly add it to the Target image list), use ModuleList::GetSharedModule() which only searches the global module cache, don't add it to the Target. We may not want to add an unstripped binary to the Target. Add a call to Symbols::DownloadObjectAndSymbolFile() even if "force_symbol_search" isn't set -- this will turn into a DebugSymbols call / Spotlight search on a macOS system, which we want. Only set the Module's LoadAddress if the caller asked us to do that. Differential Revision: https://reviews.llvm.org/D150928 rdar://109186357 --- lldb/include/lldb/Target/DynamicLoader.h | 16 ++- lldb/source/Core/DynamicLoader.cpp | 81 +++++++++----- .../MacOSX-DYLD/DynamicLoaderDarwin.cpp | 2 +- .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 103 ++++-------------- .../Platform/MacOSX/PlatformDarwinKernel.cpp | 6 +- .../Process/gdb-remote/ProcessGDBRemote.cpp | 11 +- .../Process/mach-core/ProcessMachCore.cpp | 19 ++-- lldb/source/Symbol/LocateSymbolFileMacOSX.cpp | 9 +- 8 files changed, 119 insertions(+), 128 deletions(-) diff --git a/lldb/include/lldb/Target/DynamicLoader.h b/lldb/include/lldb/Target/DynamicLoader.h index 22d047ab4b616..3aa92398d0130 100644 --- a/lldb/include/lldb/Target/DynamicLoader.h +++ b/lldb/include/lldb/Target/DynamicLoader.h @@ -256,11 +256,21 @@ class DynamicLoader : public PluginInterface { /// to the Target. The caller may prefer to batch up these when loading /// multiple binaries. /// + /// \param[in] set_address_in_target + /// Whether the address of the binary should be set in the Target if it + /// is added. The caller may want to set the section addresses + /// individually, instead of loading the binary the entire based on the + /// start address or slide. The caller is responsible for setting the + /// load address for the binary or its segments in the Target if it passes + /// true. + /// /// \return /// Returns a shared pointer for the Module that has been added. - static lldb::ModuleSP LoadBinaryWithUUIDAndAddress( - Process *process, llvm::StringRef name, UUID uuid, lldb::addr_t value, - bool value_is_offset, bool force_symbol_search, bool notify); + static lldb::ModuleSP + LoadBinaryWithUUIDAndAddress(Process *process, llvm::StringRef name, + UUID uuid, lldb::addr_t value, + bool value_is_offset, bool force_symbol_search, + bool notify, bool set_address_in_target); /// Get information about the shared cache for a process, if possible. 
/// diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp index 8849ccedbd481..2e5378f654a51 100644 --- a/lldb/source/Core/DynamicLoader.cpp +++ b/lldb/source/Core/DynamicLoader.cpp @@ -187,14 +187,13 @@ static ModuleSP ReadUnnamedMemoryModule(Process *process, addr_t addr, ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( Process *process, llvm::StringRef name, UUID uuid, addr_t value, - bool value_is_offset, bool force_symbol_search, bool notify) { + bool value_is_offset, bool force_symbol_search, bool notify, + bool set_address_in_target) { ModuleSP memory_module_sp; ModuleSP module_sp; PlatformSP platform_sp = process->GetTarget().GetPlatform(); Target &target = process->GetTarget(); Status error; - ModuleSpec module_spec; - module_spec.GetUUID() = uuid; if (!uuid.IsValid() && !value_is_offset) { memory_module_sp = ReadUnnamedMemoryModule(process, value, name); @@ -202,23 +201,46 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( if (memory_module_sp) uuid = memory_module_sp->GetUUID(); } + ModuleSpec module_spec; + module_spec.GetUUID() = uuid; + FileSpec name_filespec(name); + if (FileSystem::Instance().Exists(name_filespec)) + module_spec.GetFileSpec() = name_filespec; if (uuid.IsValid()) { - ModuleSpec module_spec; - module_spec.GetUUID() = uuid; - + // Has lldb already seen a module with this UUID? if (!module_sp) - module_sp = target.GetOrCreateModule(module_spec, false, &error); + error = ModuleList::GetSharedModule(module_spec, module_sp, nullptr, + nullptr, nullptr); + + // Can lldb's symbol/executable location schemes + // find an executable and symbol file. + if (!module_sp) { + FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths(); + module_spec.GetSymbolFileSpec() = + Symbols::LocateExecutableSymbolFile(module_spec, search_paths); + ModuleSpec objfile_module_spec = + Symbols::LocateExecutableObjectFile(module_spec); + module_spec.GetFileSpec() = objfile_module_spec.GetFileSpec(); + if (FileSystem::Instance().Exists(module_spec.GetFileSpec()) && + FileSystem::Instance().Exists(module_spec.GetSymbolFileSpec())) { + module_sp = std::make_shared(module_spec); + } + } // If we haven't found a binary, or we don't have a SymbolFile, see // if there is an external search tool that can find it. - if (force_symbol_search && - (!module_sp || !module_sp->GetSymbolFileFileSpec())) { - Symbols::DownloadObjectAndSymbolFile(module_spec, error, true); + if (!module_sp || !module_sp->GetSymbolFileFileSpec()) { + Symbols::DownloadObjectAndSymbolFile(module_spec, error, + force_symbol_search); if (FileSystem::Instance().Exists(module_spec.GetFileSpec())) { module_sp = std::make_shared(module_spec); } } + + // If we only found the executable, create a Module based on that. + if (!module_sp && FileSystem::Instance().Exists(module_spec.GetFileSpec())) + module_sp = std::make_shared(module_spec); } // If we couldn't find the binary anywhere else, as a last resort, @@ -239,25 +261,34 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress( target.GetImages().AppendIfNeeded(module_sp, false); bool changed = false; - if (module_sp->GetObjectFile()) { - if (value != LLDB_INVALID_ADDRESS) { - LLDB_LOGF(log, "Loading binary UUID %s at %s 0x%" PRIx64, - uuid.GetAsString().c_str(), - value_is_offset ? 
"offset" : "address", value); - module_sp->SetLoadAddress(target, value, value_is_offset, changed); + if (set_address_in_target) { + if (module_sp->GetObjectFile()) { + if (value != LLDB_INVALID_ADDRESS) { + LLDB_LOGF(log, + "DynamicLoader::LoadBinaryWithUUIDAndAddress Loading " + "binary UUID %s at %s 0x%" PRIx64, + uuid.GetAsString().c_str(), + value_is_offset ? "offset" : "address", value); + module_sp->SetLoadAddress(target, value, value_is_offset, changed); + } else { + // No address/offset/slide, load the binary at file address, + // offset 0. + LLDB_LOGF(log, + "DynamicLoader::LoadBinaryWithUUIDAndAddress Loading " + "binary UUID %s at file address", + uuid.GetAsString().c_str()); + module_sp->SetLoadAddress(target, 0, true /* value_is_slide */, + changed); + } } else { - // No address/offset/slide, load the binary at file address, - // offset 0. - LLDB_LOGF(log, "Loading binary UUID %s at file address", - uuid.GetAsString().c_str()); + // In-memory image, load at its true address, offset 0. + LLDB_LOGF(log, + "DynamicLoader::LoadBinaryWithUUIDAndAddress Loading binary " + "UUID %s from memory at address 0x%" PRIx64, + uuid.GetAsString().c_str(), value); module_sp->SetLoadAddress(target, 0, true /* value_is_slide */, changed); } - } else { - // In-memory image, load at its true address, offset 0. - LLDB_LOGF(log, "Loading binary UUID %s from memory at address 0x%" PRIx64, - uuid.GetAsString().c_str(), value); - module_sp->SetLoadAddress(target, 0, true /* value_is_slide */, changed); } if (notify) { diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp index 4aaf0140fe75d..0230ae23f4a0b 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp @@ -808,7 +808,7 @@ void DynamicLoaderDarwin::ImageInfo::PutToLog(Log *log) const { LLDB_LOG(log, "uuid={1} path='{2}' (UNLOADED)", uuid.GetAsString(), file_spec.GetPath()); } else { - LLDB_LOG(log, "address={0:x+16} uuid={2} path='{3}'", address, + LLDB_LOG(log, "address={0:x+16} uuid={1} path='{2}'", address, uuid.GetAsString(), file_spec.GetPath()); for (uint32_t i = 0; i < segments.size(); ++i) segments[i].PutToLog(log, slide); diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index d78ed63b0e42d..887c0b64c2c62 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -6878,62 +6878,22 @@ bool ObjectFileMachO::LoadCoreFileImages(lldb_private::Process &process) { continue; } - // If this binary is currently executing, we want to force a - // possibly expensive search for the binary and its dSYM. - if (image.currently_executing && image.uuid.IsValid()) { - ModuleSpec module_spec; - module_spec.GetUUID() = image.uuid; - Symbols::DownloadObjectAndSymbolFile(module_spec, error, true); - if (FileSystem::Instance().Exists(module_spec.GetFileSpec())) { - module_sp = process.GetTarget().GetOrCreateModule(module_spec, false); - process.GetTarget().GetImages().AppendIfNeeded(module_sp, - false /* notify */); - } - } - - // We have an address, that's the best way to discover the binary. 
- if (!module_sp && image.load_address != LLDB_INVALID_ADDRESS) { - module_sp = DynamicLoader::LoadBinaryWithUUIDAndAddress( - &process, image.filename, image.uuid, image.load_address, - false /* value_is_offset */, image.currently_executing, - false /* notify */); - if (module_sp) { - // We've already set the load address in the Target, - // don't do any more processing on this module. - added_modules.Append(module_sp, false /* notify */); - continue; - } + bool value_is_offset = image.load_address == LLDB_INVALID_ADDRESS; + uint64_t value = value_is_offset ? image.slide : image.load_address; + if (value_is_offset && value == LLDB_INVALID_ADDRESS) { + // We have neither address nor slide; so we will find the binary + // by UUID and load it at slide/offset 0. + value = 0; } - // If we have a slide, we need to find the original binary - // by UUID, then we can apply the slide value. - if (!module_sp && image.uuid.IsValid() && - image.slide != LLDB_INVALID_ADDRESS) { + // We have either a UUID, or we have a load address which + // and can try to read load commands and find a UUID. + if (image.uuid.IsValid() || + (!value_is_offset && value != LLDB_INVALID_ADDRESS)) { + const bool set_load_address = image.segment_load_addresses.size() == 0; module_sp = DynamicLoader::LoadBinaryWithUUIDAndAddress( - &process, image.filename, image.uuid, image.slide, - true /* value_is_offset */, image.currently_executing, - false /* notify */); - if (module_sp) { - // We've already set the load address in the Target, - // don't do any more processing on this module. - added_modules.Append(module_sp, false /* notify */); - continue; - } - } - - // Try to find the binary by UUID or filename on the local - // filesystem or in lldb's global module cache. - if (!module_sp) { - Status error; - ModuleSpec module_spec; - if (image.uuid.IsValid()) - module_spec.GetUUID() = image.uuid; - if (!image.filename.empty()) - module_spec.GetFileSpec() = FileSpec(image.filename.c_str()); - module_sp = - process.GetTarget().GetOrCreateModule(module_spec, false, &error); - process.GetTarget().GetImages().AppendIfNeeded(module_sp, - false /* notify */); + &process, image.filename, image.uuid, value, value_is_offset, + image.currently_executing, false /* notify */, set_load_address); } // We have a ModuleSP to load in the Target. 
Load it at the @@ -6947,7 +6907,8 @@ bool ObjectFileMachO::LoadCoreFileImages(lldb_private::Process &process) { std::string uuidstr = image.uuid.GetAsString(); log->Printf("ObjectFileMachO::LoadCoreFileImages adding binary '%s' " "UUID %s with section load addresses", - image.filename.c_str(), uuidstr.c_str()); + module_sp->GetFileSpec().GetPath().c_str(), + uuidstr.c_str()); } for (auto name_vmaddr_tuple : image.segment_load_addresses) { SectionList *sectlist = module_sp->GetObjectFile()->GetSectionList(); @@ -6960,39 +6921,17 @@ bool ObjectFileMachO::LoadCoreFileImages(lldb_private::Process &process) { } } } - } else if (image.load_address != LLDB_INVALID_ADDRESS) { - if (log) { - std::string uuidstr = image.uuid.GetAsString(); - log->Printf("ObjectFileMachO::LoadCoreFileImages adding binary '%s' " - "UUID %s with load address 0x%" PRIx64, - image.filename.c_str(), uuidstr.c_str(), - image.load_address); - } - const bool address_is_slide = false; - bool changed = false; - module_sp->SetLoadAddress(process.GetTarget(), image.load_address, - address_is_slide, changed); - } else if (image.slide != 0) { - if (log) { - std::string uuidstr = image.uuid.GetAsString(); - log->Printf("ObjectFileMachO::LoadCoreFileImages adding binary '%s' " - "UUID %s with slide amount 0x%" PRIx64, - image.filename.c_str(), uuidstr.c_str(), image.slide); - } - const bool address_is_slide = true; - bool changed = false; - module_sp->SetLoadAddress(process.GetTarget(), image.slide, - address_is_slide, changed); } else { if (log) { std::string uuidstr = image.uuid.GetAsString(); log->Printf("ObjectFileMachO::LoadCoreFileImages adding binary '%s' " - "UUID %s at its file address, no slide applied", - image.filename.c_str(), uuidstr.c_str()); + "UUID %s with %s 0x%" PRIx64, + module_sp->GetFileSpec().GetPath().c_str(), + uuidstr.c_str(), + value_is_offset ? "slide" : "load address", value); } - const bool address_is_slide = true; - bool changed = false; - module_sp->SetLoadAddress(process.GetTarget(), 0, address_is_slide, + bool changed; + module_sp->SetLoadAddress(process.GetTarget(), value, value_is_offset, changed); } } diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp index d120ae05c82bc..dd2ec906cd007 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp @@ -952,14 +952,14 @@ bool PlatformDarwinKernel::LoadPlatformBinaryAndSetup(Process *process, addr_t actual_address = find_kernel_in_macho_fileset(process, input_addr); + if (actual_address == LLDB_INVALID_ADDRESS) + return false; + LLDB_LOGF(log, "PlatformDarwinKernel::%s check address 0x%" PRIx64 " for " "a macho fileset, got back kernel address 0x%" PRIx64, __FUNCTION__, input_addr, actual_address); - if (actual_address == LLDB_INVALID_ADDRESS) - return false; - // We have a xnu kernel binary, this is a kernel debug session. // Set the Target's Platform to be PlatformDarwinKernel, and the // Process' DynamicLoader to be DynamicLoaderDarwinKernel. 
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index d20a02211f884..23b9cfdcce163 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -995,9 +995,11 @@ void ProcessGDBRemote::LoadStubBinaries() { if (standalone_uuid.IsValid()) { const bool force_symbol_search = true; const bool notify = true; + const bool set_address_in_target = true; DynamicLoader::LoadBinaryWithUUIDAndAddress( this, "", standalone_uuid, standalone_value, - standalone_value_is_offset, force_symbol_search, notify); + standalone_value_is_offset, force_symbol_search, notify, + set_address_in_target); } } @@ -1025,10 +1027,11 @@ void ProcessGDBRemote::LoadStubBinaries() { continue; const bool force_symbol_search = true; + const bool set_address_in_target = true; // Second manually load this binary into the Target. - DynamicLoader::LoadBinaryWithUUIDAndAddress(this, llvm::StringRef(), uuid, - addr, value_is_slide, - force_symbol_search, notify); + DynamicLoader::LoadBinaryWithUUIDAndAddress( + this, llvm::StringRef(), uuid, addr, value_is_slide, + force_symbol_search, notify, set_address_in_target); } } } diff --git a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp index 9bae7b0df5281..40918dba48905 100644 --- a/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp +++ b/lldb/source/Plugins/Process/mach-core/ProcessMachCore.cpp @@ -252,20 +252,20 @@ void ProcessMachCore::LoadBinariesViaMetadata() { m_mach_kernel_addr = objfile_binary_value; m_dyld_plugin_name = DynamicLoaderDarwinKernel::GetPluginNameStatic(); found_main_binary_definitively = true; + } else if (type == ObjectFile::eBinaryTypeUser) { + m_dyld_addr = objfile_binary_value; + m_dyld_plugin_name = DynamicLoaderMacOSXDYLD::GetPluginNameStatic(); } else { const bool force_symbol_search = true; const bool notify = true; + const bool set_address_in_target = true; if (DynamicLoader::LoadBinaryWithUUIDAndAddress( this, llvm::StringRef(), objfile_binary_uuid, objfile_binary_value, objfile_binary_value_is_offset, - force_symbol_search, notify)) { + force_symbol_search, notify, set_address_in_target)) { found_main_binary_definitively = true; m_dyld_plugin_name = DynamicLoaderStatic::GetPluginNameStatic(); } - if (type == ObjectFile::eBinaryTypeUser) { - m_dyld_addr = objfile_binary_value; - m_dyld_plugin_name = DynamicLoaderMacOSXDYLD::GetPluginNameStatic(); - } } } @@ -314,9 +314,11 @@ void ProcessMachCore::LoadBinariesViaMetadata() { const bool value_is_offset = false; const bool force_symbol_search = true; const bool notify = true; + const bool set_address_in_target = true; if (DynamicLoader::LoadBinaryWithUUIDAndAddress( this, llvm::StringRef(), ident_uuid, ident_binary_addr, - value_is_offset, force_symbol_search, notify)) { + value_is_offset, force_symbol_search, notify, + set_address_in_target)) { found_main_binary_definitively = true; m_dyld_plugin_name = DynamicLoaderStatic::GetPluginNameStatic(); } @@ -325,7 +327,10 @@ void ProcessMachCore::LoadBinariesViaMetadata() { // Finally, load any binaries noted by "load binary" LC_NOTEs in the // corefile - core_objfile->LoadCoreFileImages(*this); + if (core_objfile->LoadCoreFileImages(*this)) { + found_main_binary_definitively = true; + m_dyld_plugin_name = DynamicLoaderStatic::GetPluginNameStatic(); + } // LoadCoreFileImges may have set the dynamic loader, e.g. 
in // PlatformDarwinKernel::LoadPlatformBinaryAndSetup(). diff --git a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp index 7d24905be3504..8c52df5f2a0a7 100644 --- a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp +++ b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp @@ -559,14 +559,17 @@ bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, const UUID *uuid_ptr = module_spec.GetUUIDPtr(); const FileSpec *file_spec_ptr = module_spec.GetFileSpecPtr(); + // If \a dbgshell_command is set, the user has specified + // forced symbol lookup via that command. We'll get the + // path back from GetDsymForUUIDExecutable() later. llvm::StringRef dbgshell_command = GetDbgShellCommand(); - // When dbgshell_command is empty, the user has not enabled the use of an - // external program to find the symbols, don't run it for them. + // If forced lookup isn't set, by the user's \a dbgshell_command or + // by the \a force_lookup argument, exit this method. if (!force_lookup && dbgshell_command.empty()) return false; - // We need a UUID or valid (existing FileSpec. + // We need a UUID or valid existing FileSpec. if (!uuid_ptr && (!file_spec_ptr || !FileSystem::Instance().Exists(*file_spec_ptr))) return false; From 061a839033dc5f11c4e43fb64ed49cc85e1e5f32 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Wed, 24 May 2023 10:30:49 -0700 Subject: [PATCH 167/704] [lldb] Prevent dwim-print from showing kNoResult error Expression evaluation for `void` valued expressions sets an error using the `kNoResult` code. Like the `expression` command, `dwim-print` should also not print such errors. Before: ``` (lldb) dwim-print (void)printf("hi\n") hi Error: 'unknown error' ``` After: ``` (lldb) dwim-print (void)printf("hi\n") hi ``` rdar://109746544 Differential Revision: https://reviews.llvm.org/D151351 --- lldb/source/Commands/CommandObjectDWIMPrint.cpp | 4 +++- lldb/test/API/commands/dwim-print/TestDWIMPrint.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp index 8fc702a1a220e..7cb95fd622ba1 100644 --- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp +++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp @@ -11,6 +11,7 @@ #include "lldb/Core/ValueObject.h" #include "lldb/DataFormatters/DumpValueObjectOptions.h" #include "lldb/Expression/ExpressionVariable.h" +#include "lldb/Expression/UserExpression.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Interpreter/CommandObject.h" #include "lldb/Interpreter/CommandReturnObject.h" @@ -135,7 +136,8 @@ bool CommandObjectDWIMPrint::DoExecute(StringRef command, expr); } - valobj_sp->Dump(result.GetOutputStream(), dump_options); + if (valobj_sp->GetError().GetError() != UserExpression::kNoResult) + valobj_sp->Dump(result.GetOutputStream(), dump_options); if (suppress_result) if (auto result_var_sp = diff --git a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py index f2799ef53d49c..9cb99a2a817d6 100644 --- a/lldb/test/API/commands/dwim-print/TestDWIMPrint.py +++ b/lldb/test/API/commands/dwim-print/TestDWIMPrint.py @@ -138,3 +138,9 @@ def test_summary_strings(self): self.runCmd("type summary add -e -s 'stub summary' Structure") self._expect_cmd(f"dwim-print s", "frame variable") self._expect_cmd(f"dwim-print (struct Structure)s", "expression") + + def test_void_result(self): + """Test dwim-print does not surface an error message for void 
expressions.""" + self.build() + lldbutil.run_to_source_breakpoint(self, "// break here", lldb.SBFileSpec("main.c")) + self.expect("dwim-print (void)15", matching=False, patterns=["(?i)error"]) From 540d5e0ce66cefb072ab8f22df62468357c9ed0f Mon Sep 17 00:00:00 2001 From: wren romano <2998727+wrengr@users.noreply.github.com> Date: Tue, 30 May 2023 13:53:27 -0700 Subject: [PATCH 168/704] [mlir][sparse] Updating STEA parser/printer to use the name "dimSlices" Depends On D151505 Reviewed By: Peiming Differential Revision: https://reviews.llvm.org/D151513 --- .../SparseTensor/IR/SparseTensorAttrDefs.td | 5 +---- .../SparseTensor/IR/SparseTensorDialect.cpp | 12 ++++++------ .../SparseTensor/convert_sparse2sparse.mlir | 2 +- mlir/test/Dialect/SparseTensor/invalid.mlir | 6 +++--- .../Dialect/SparseTensor/invalid_encoding.mlir | 2 +- .../test/Dialect/SparseTensor/pre_rewriting.mlir | 2 +- mlir/test/Dialect/SparseTensor/roundtrip.mlir | 10 +++++----- .../Dialect/SparseTensor/roundtrip_encoding.mlir | 12 ++++++------ .../SparseTensor/sparse_extract_slice.mlir | 2 +- .../Dialect/SparseTensor/sparse_foreach.mlir | 4 ++-- .../SparseTensor/CPU/sparse_foreach_slices.mlir | 8 ++++---- .../SparseTensor/CPU/sparse_matmul_slice.mlir | 16 ++++++++-------- 12 files changed, 39 insertions(+), 42 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index e49d7be36620c..f0a502e5dcd9c 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -244,7 +244,7 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", // offset = 0, size = 8, and a dynamic stride on the second dimension). #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (0, 4, 1), (0, 8, ?) ] + dimSlices = [ (0, 4, 1), (0, 8, ?) ] }> ... tensor ... @@ -266,9 +266,6 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", // The required bitwidth for coordinate storage. "unsigned":$crdWidth, // A slice attribute for each dimension of the tensor type. - // FIXME: The name used here is `dimSlices`, however the - // parser/printer uses the name `slice` instead. Therefore - // the parser/printer need to be updated to match. ArrayRefParameter< "::mlir::sparse_tensor::SparseTensorDimSliceAttr", "per dimension slice metadata" diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index ae4198f5dce69..962e0ac21c637 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -408,7 +408,7 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) { // Process the data from the parsed dictionary value into struct-like data. SmallVector lvlTypes; - SmallVector slices; + SmallVector dimSlices; AffineMap dimToLvl = {}; unsigned posWidth = 0; unsigned crdWidth = 0; @@ -416,7 +416,7 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) { StringRef attrName; // Exactly 6 keys. 
SmallVector keys = {"lvlTypes", "dimToLvl", "posWidth", - "crdWidth", "slice"}; + "crdWidth", "dimSlices"}; while (succeeded(parser.parseOptionalKeyword(&attrName))) { if (!llvm::is_contained(keys, attrName)) { parser.emitError(parser.getNameLoc(), "unexpected key: ") << attrName; @@ -464,13 +464,13 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) { auto intAttr = llvm::dyn_cast(attr); ERROR_IF(!intAttr, "expected an integral index bitwidth") crdWidth = intAttr.getInt(); - } else if (attrName == "slice") { + } else if (attrName == "dimSlices") { RETURN_ON_FAIL(parser.parseLSquare()) // Dispatches to DimSliceAttr to skip mnemonic bool finished = false; while (auto attr = SparseTensorDimSliceAttr::parse(parser, nullptr)) { auto sliceAttr = llvm::cast(attr); - slices.push_back(sliceAttr); + dimSlices.push_back(sliceAttr); if (parser.parseOptionalComma().failed()) { finished = true; break; @@ -494,7 +494,7 @@ Attribute SparseTensorEncodingAttr::parse(AsmParser &parser, Type type) { // Construct struct-like storage for attribute. return parser.getChecked( - parser.getContext(), lvlTypes, dimToLvl, posWidth, crdWidth, slices); + parser.getContext(), lvlTypes, dimToLvl, posWidth, crdWidth, dimSlices); } void SparseTensorEncodingAttr::print(AsmPrinter &printer) const { @@ -512,7 +512,7 @@ void SparseTensorEncodingAttr::print(AsmPrinter &printer) const { if (getCrdWidth()) printer << ", crdWidth = " << getCrdWidth(); if (!getDimSlices().empty()) { - printer << ", slice = [ "; + printer << ", dimSlices = [ "; llvm::interleaveComma(getDimSlices(), printer, [&](SparseTensorDimSliceAttr attr) { // Calls SparseTensorDimSliceAttr::print directly to diff --git a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir index 26f41e142b8b7..fd612d5f597d5 100644 --- a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir +++ b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir @@ -41,7 +41,7 @@ #COOSlice = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton" ], - slice = [ (2, 2, 1), (12, 13, 1) ] + dimSlices = [ (2, 2, 1), (12, 13, 1) ] }> // CHECK-LABEL: func @sparse_nop_convert( diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir index c1e8afd9206ba..7a6c4824aabed 100644 --- a/mlir/test/Dialect/SparseTensor/invalid.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid.mlir @@ -202,7 +202,7 @@ func.func @mismatch_values_types(%arg0: tensor) -> memref< #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (1, 4, 1), (1, 4, 2) ] + dimSlices = [ (1, 4, 1), (1, 4, 2) ] }> func.func @sparse_slice_offset(%arg0: tensor<2x8xf64, #CSR_SLICE>) -> index { @@ -215,7 +215,7 @@ func.func @sparse_slice_offset(%arg0: tensor<2x8xf64, #CSR_SLICE>) -> index { #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (1, 4, 1), (1, 4, 2) ] + dimSlices = [ (1, 4, 1), (1, 4, 2) ] }> func.func @sparse_slice_stride(%arg0: tensor<2x8xf64, #CSR_SLICE>) -> index { @@ -401,7 +401,7 @@ func.func @invalid_out_dense(%arg0: tensor<10xf64>, %arg1: !llvm.ptr) { #CSR = #sparse_tensor.encoding<{ lvlTypes = ["dense", "compressed"], - slice = [ (1, 4, 1), (1, 4, 2) ] + dimSlices = [ (1, 4, 1), (1, 4, 2) ] }> func.func @sparse_convert_to_slice(%arg0: tensor<10x?xf32>) -> tensor<10x10xf32, #CSR> { diff --git a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir index 
91c3ef7b6d62d..e76df6551c2e1 100644 --- a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir @@ -66,6 +66,6 @@ func.func private @tensor_invalid_key(%arg0: tensor<16x32xf32, #a>) -> () #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (-1, ?, 1), (?, 4, 2) ] // expected-error{{expect positive value or ? for slice offset/size/stride}} + dimSlices = [ (-1, ?, 1), (?, 4, 2) ] // expected-error{{expect positive value or ? for slice offset/size/stride}} }> func.func private @sparse_slice(tensor) diff --git a/mlir/test/Dialect/SparseTensor/pre_rewriting.mlir b/mlir/test/Dialect/SparseTensor/pre_rewriting.mlir index d35296b924739..8aed1d6d205bd 100644 --- a/mlir/test/Dialect/SparseTensor/pre_rewriting.mlir +++ b/mlir/test/Dialect/SparseTensor/pre_rewriting.mlir @@ -10,7 +10,7 @@ #Slice = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton" ], - slice = [ (?, 1, 1), (?, 3, 1) ] + dimSlices = [ (?, 1, 1), (?, 3, 1) ] }> // CHECK-LABEL: func @sparse_nop_cast( diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir index 57dff1e53edc3..43429f454e122 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir @@ -144,7 +144,7 @@ func.func @sparse_values(%arg0: tensor<128xf64, #SparseVector>) -> memref #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (1, 4, 1), (1, 4, 2) ] + dimSlices = [ (1, 4, 1), (1, 4, 2) ] }> // CHECK-LABEL: func @sparse_slice_offset( @@ -160,7 +160,7 @@ func.func @sparse_slice_offset(%arg0: tensor<2x8xf64, #CSR_SLICE>) -> index { #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (1, 4, 1), (1, 4, 2) ] + dimSlices = [ (1, 4, 1), (1, 4, 2) ] }> // CHECK-LABEL: func @sparse_slice_stride( @@ -189,7 +189,7 @@ func.func @sparse_metadata_init() -> !sparse_tensor.storage_specifier<#SparseVec #SparseVector = #sparse_tensor.encoding<{lvlTypes = ["compressed"]}> #SparseVector_Slice = #sparse_tensor.encoding<{ lvlTypes = ["compressed"], - slice = [ (?, ?, ?) ] + dimSlices = [ (?, ?, ?) ] }> // CHECK-LABEL: func @sparse_metadata_init( @@ -221,7 +221,7 @@ func.func @sparse_get_md(%arg0: !sparse_tensor.storage_specifier<#SparseVector>) #SparseVector_Slice = #sparse_tensor.encoding<{ lvlTypes = ["compressed"], - slice = [ (?, ?, ?) ] + dimSlices = [ (?, ?, ?) ] }> // CHECK-LABEL: func @sparse_get_md( @@ -238,7 +238,7 @@ func.func @sparse_get_md(%arg0: !sparse_tensor.storage_specifier<#SparseVector_S #SparseVector = #sparse_tensor.encoding<{ lvlTypes = ["compressed"], - slice = [ (?, ?, ?) ] + dimSlices = [ (?, ?, ?) 
] }> // CHECK-LABEL: func @sparse_get_md( diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir index 4a7cd76ac489f..75f8d071fcfc0 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir @@ -100,31 +100,31 @@ func.func private @sparse_ell(tensor) #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (1, 4, 1), (1, 4, 2) ] + dimSlices = [ (1, 4, 1), (1, 4, 2) ] }> // CHECK-LABEL: func private @sparse_slice( -// CHECK-SAME: tensor> +// CHECK-SAME: tensor> func.func private @sparse_slice(tensor) // ----- #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (1, 4, 1), (1, 4, 2) ] + dimSlices = [ (1, 4, 1), (1, 4, 2) ] }> // CHECK-LABEL: func private @sparse_slice( -// CHECK-SAME: tensor> +// CHECK-SAME: tensor> func.func private @sparse_slice(tensor) // ----- #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (1, ?, 1), (?, 4, 2) ] + dimSlices = [ (1, ?, 1), (?, 4, 2) ] }> // CHECK-LABEL: func private @sparse_slice( -// CHECK-SAME: tensor> +// CHECK-SAME: tensor> func.func private @sparse_slice(tensor) diff --git a/mlir/test/Dialect/SparseTensor/sparse_extract_slice.mlir b/mlir/test/Dialect/SparseTensor/sparse_extract_slice.mlir index 8cf8c6c89b63c..efb920b7af13c 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_extract_slice.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_extract_slice.mlir @@ -6,7 +6,7 @@ #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (0, 4, 1), (0, 8, 1) ] + dimSlices = [ (0, 4, 1), (0, 8, 1) ] }> // CHECK-LABEL: func.func @sparse_slice( diff --git a/mlir/test/Dialect/SparseTensor/sparse_foreach.mlir b/mlir/test/Dialect/SparseTensor/sparse_foreach.mlir index 8038e141662e5..339c94d0f78e1 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_foreach.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_foreach.mlir @@ -30,12 +30,12 @@ func.func @sparse_foreach_constant() -> () { #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - slice = [ (0, 4, 1), (2, 4, 1) ] + dimSlices = [ (0, 4, 1), (2, 4, 1) ] }> #CSR_SLICE_DYN = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - slice = [ (?, ?, ?), (?, ?, ?) ] + dimSlices = [ (?, ?, ?), (?, ?, ?) ] }> diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_foreach_slices.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_foreach_slices.mlir index 43b75f8aa2fe2..fc259b255c456 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_foreach_slices.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_foreach_slices.mlir @@ -16,12 +16,12 @@ #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (1, 4, 1), (1, 4, 2) ] + dimSlices = [ (1, 4, 1), (1, 4, 2) ] }> #CSR_SLICE_DYN = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (?, ?, ?), (?, ?, ?) ] + dimSlices = [ (?, ?, ?), (?, ?, ?) ] }> #COO = #sparse_tensor.encoding<{ @@ -30,12 +30,12 @@ #COO_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton" ], - slice = [ (1, 4, 1), (1, 4, 2) ] + dimSlices = [ (1, 4, 1), (1, 4, 2) ] }> #COO_SLICE_DYN = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton" ], - slice = [ (?, ?, ?), (?, ?, ?) ] + dimSlices = [ (?, ?, ?), (?, ?, ?) 
] }> diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir index c5d6032db0e65..c9723070dd18d 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul_slice.mlir @@ -16,7 +16,7 @@ #DCSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - slice = [ (0, 4, 1), (0, 8, 1) ] + dimSlices = [ (0, 4, 1), (0, 8, 1) ] }> #CSR = #sparse_tensor.encoding<{ @@ -25,7 +25,7 @@ #CSR_SLICE = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (0, 4, 1), (0, 8, 1) ] + dimSlices = [ (0, 4, 1), (0, 8, 1) ] }> #COO = #sparse_tensor.encoding<{ @@ -34,32 +34,32 @@ #CSR_SLICE_1 = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (0, 4, 2), (0, 4, 1) ] + dimSlices = [ (0, 4, 2), (0, 4, 1) ] }> #DCSR_SLICE_1 = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - slice = [ (0, 4, 2), (1, 4, 1) ] + dimSlices = [ (0, 4, 2), (1, 4, 1) ] }> #COO_SLICE_1 = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton" ], - slice = [ (0, 4, 2), (0, 4, 1) ] + dimSlices = [ (0, 4, 2), (0, 4, 1) ] }> #COO_SLICE_2 = #sparse_tensor.encoding<{ lvlTypes = [ "compressed-nu", "singleton" ], - slice = [ (0, 4, 2), (1, 4, 1) ] + dimSlices = [ (0, 4, 2), (1, 4, 1) ] }> #CSR_SLICE_dyn = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ], - slice = [ (?, 4, ?), (?, 4, ?) ] + dimSlices = [ (?, 4, ?), (?, 4, ?) ] }> #DCSR_SLICE_dyn = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed" ], - slice = [ (?, 4, ?), (?, 4, ?) ] + dimSlices = [ (?, 4, ?), (?, 4, ?) ] }> module { From af2bec7c4a967c9e2e009cdbc4470eb5ba8332f6 Mon Sep 17 00:00:00 2001 From: wren romano <2998727+wrengr@users.noreply.github.com> Date: Tue, 30 May 2023 14:16:17 -0700 Subject: [PATCH 169/704] [mlir][sparse] Adding new STEA::{with,without}DimSlices factories (These factories are used in downstream code, despite not being used within the MLIR codebase.) Depends On D151513 Reviewed By: Peiming Differential Revision: https://reviews.llvm.org/D151518 --- .../Dialect/SparseTensor/IR/SparseTensorAttrDefs.td | 8 ++++++++ .../mlir/Dialect/SparseTensor/IR/SparseTensorType.h | 9 +++++++++ .../Dialect/SparseTensor/IR/SparseTensorDialect.cpp | 11 +++++++++++ .../SparseTensor/Transforms/SparseTensorCodegen.cpp | 5 +---- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index f0a502e5dcd9c..9fe425a40415b 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -304,6 +304,14 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", /// reset to the default, and all other fields inherited from `this`. SparseTensorEncodingAttr withoutBitWidths() const; + /// Constructs a new encoding with the given dimSlices, and all + /// other fields inherited from `this`. + SparseTensorEncodingAttr withDimSlices(ArrayRef<::mlir::sparse_tensor::SparseTensorDimSliceAttr> dimSlices) const; + + /// Constructs a new encoding with the dimSlices reset to the default, + /// and all other fields inherited from `this`. + SparseTensorEncodingAttr withoutDimSlices() const; + // // Rank methods. 
// diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h index 6cae09db36cc1..cfc3374148f95 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h @@ -111,6 +111,15 @@ class SparseTensorType { return withEncoding(enc.withoutBitWidths()); } + SparseTensorType + withDimSlices(ArrayRef dimSlices) const { + return withEncoding(enc.withDimSlices(dimSlices)); + } + + SparseTensorType withoutDimSlices() const { + return withEncoding(enc.withoutDimSlices()); + } + // // Other methods. // diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 962e0ac21c637..a1eda8968a551 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -291,6 +291,17 @@ SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutBitWidths() const { return withBitWidths(0, 0); } +SparseTensorEncodingAttr SparseTensorEncodingAttr::withDimSlices( + ArrayRef dimSlices) const { + return SparseTensorEncodingAttr::get(getContext(), getLvlTypes(), + getDimToLvl(), getPosWidth(), + getCrdWidth(), dimSlices); +} + +SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutDimSlices() const { + return withDimSlices(ArrayRef{}); +} + bool SparseTensorEncodingAttr::isAllDense() const { return !getImpl() || llvm::all_of(getLvlTypes(), isDenseDLT); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp index f84009c4b63bd..a7f37e8189ea0 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp @@ -1138,10 +1138,7 @@ class SparseExtractSliceConverter // TODO: We should check these in ExtractSliceOp::verify. if (!srcEnc || !dstEnc || !dstEnc.isSlice()) return failure(); - assert(srcEnc.getLvlTypes() == dstEnc.getLvlTypes()); - assert(srcEnc.getDimToLvl() == dstEnc.getDimToLvl()); - assert(srcEnc.getPosWidth() == dstEnc.getPosWidth()); - assert(srcEnc.getCrdWidth() == dstEnc.getCrdWidth()); + assert(srcEnc.withoutDimSlices() == dstEnc.withoutDimSlices()); SmallVector fields; auto desc = getMutDescriptorFromTensorTuple(adaptor.getSource(), fields); From f58e67dee9355f54a88aa866b355c298317c3272 Mon Sep 17 00:00:00 2001 From: wren romano <2998727+wrengr@users.noreply.github.com> Date: Tue, 30 May 2023 13:47:22 -0700 Subject: [PATCH 170/704] [mlir][sparse] Removing unused helper function Depends On D151505 Reviewed By: aartbik, Peiming Differential Revision: https://reviews.llvm.org/D151522 --- .../Dialect/SparseTensor/IR/SparseTensorDialect.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index a1eda8968a551..7f8dcba77fc8e 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -31,17 +31,6 @@ using namespace mlir; using namespace mlir::sparse_tensor; -//===----------------------------------------------------------------------===// -// Additional convenience methods. -//===----------------------------------------------------------------------===// - -/// Gets the dimension-rank of the type of some `T`. 
(In particular -/// this is only used for `Value` and `TypedValue`.) -template -static inline Dimension getDimRank(T t) { - return getRankedTensorType(t).getRank(); -} - //===----------------------------------------------------------------------===// // StorageLayout //===----------------------------------------------------------------------===// From af4da3d7463028542c77397c3da0ced76e1e97c3 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 19 May 2023 07:35:08 +0000 Subject: [PATCH 171/704] [CMake] Remove BOLT from Fuchsia toolchain Fuchsia toolchain is currently broken due to https://reviews.llvm.org/D151595. While we wait for it to be resolved, remove BOLT to unbreak the Fuchsia toolchain build. Differential Revision: https://reviews.llvm.org/D150939 --- clang/cmake/caches/Fuchsia-stage2.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 8f5aa21be6fb8..2465cc7b8c2f8 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -6,7 +6,7 @@ set(LLVM_TARGETS_TO_BUILD X86;ARM;AArch64;RISCV CACHE STRING "") set(PACKAGE_VENDOR Fuchsia CACHE STRING "") -set(_FUCHSIA_ENABLE_PROJECTS "bolt;clang;clang-tools-extra;lld;llvm;polly") +set(_FUCHSIA_ENABLE_PROJECTS "clang;clang-tools-extra;lld;llvm;polly") set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") set(LLVM_ENABLE_BACKTRACES OFF CACHE BOOL "") @@ -328,7 +328,6 @@ set(LLVM_TOOLCHAIN_TOOLS CACHE STRING "") set(LLVM_Toolchain_DISTRIBUTION_COMPONENTS - bolt clang lld clang-apply-replacements From 8148fc576accfadb5511777d3f30fbde15446a28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Villegas?= Date: Tue, 30 May 2023 16:03:37 -0700 Subject: [PATCH 172/704] [llvm-debuginfod][NFC] Switch to OptTable Reviewed By: mysterymath Differential Revision: https://reviews.llvm.org/D151273 --- llvm/tools/llvm-debuginfod/CMakeLists.txt | 8 + llvm/tools/llvm-debuginfod/Opts.td | 20 +++ .../tools/llvm-debuginfod/llvm-debuginfod.cpp | 140 +++++++++++++----- .../llvm/tools/llvm-debuginfod/BUILD.gn | 8 + 4 files changed, 136 insertions(+), 40 deletions(-) create mode 100644 llvm/tools/llvm-debuginfod/Opts.td diff --git a/llvm/tools/llvm-debuginfod/CMakeLists.txt b/llvm/tools/llvm-debuginfod/CMakeLists.txt index 72f2c19848489..d32c6826d7687 100644 --- a/llvm/tools/llvm-debuginfod/CMakeLists.txt +++ b/llvm/tools/llvm-debuginfod/CMakeLists.txt @@ -1,8 +1,16 @@ set(LLVM_LINK_COMPONENTS + Option Support ) +set(LLVM_TARGET_DEFINITIONS Opts.td) +tablegen(LLVM Opts.inc -gen-opt-parser-defs) +add_public_tablegen_target(DebugInfodOptsTableGen) + add_llvm_tool(llvm-debuginfod llvm-debuginfod.cpp + + DEPENDS + DebugInfodOptsTableGen ) target_link_libraries(llvm-debuginfod PRIVATE LLVMDebuginfod) if(LLVM_INSTALL_BINUTILS_SYMLINKS) diff --git a/llvm/tools/llvm-debuginfod/Opts.td b/llvm/tools/llvm-debuginfod/Opts.td new file mode 100644 index 0000000000000..1de241a3fc2a1 --- /dev/null +++ b/llvm/tools/llvm-debuginfod/Opts.td @@ -0,0 +1,20 @@ +include "llvm/Option/OptParser.td" + +class F : Flag<["-"], name>, HelpText; +class FF: Flag<["--"], name>, HelpText; +class S: Separate<["-"], name>, HelpText, MetaVarName; + +def help : FF<"help", "Display available options">; +def : F<"h", "Alias for --help">, Alias; +def max_concurrency : + S<"c", "", "Maximum number of files to scan concurrently. 
" + "If 0, use the hardware concurrency.">; +def host_interface : S<"i", "", "Host interface to bind to.">; +def min_interval : + S<"m", "", "Minimum number of seconds to wait before an on-demand update can be" + "triggered by a request for a buildid which is not in the collection.">; +def port : S<"p", "", "Port to listen on. Set to 0 to bind to any available port.">; +def scan_interval : + S<"t", "", "Number of seconds to wait between subsequent " + "automated scans of the filesystem.">; +def verbose_logging : F<"v", "Enable verbose logging.">; diff --git a/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp b/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp index c64d4dbb3155f..ecdd06025fbaa 100644 --- a/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp +++ b/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp @@ -15,60 +15,120 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/ADT/StringRef.h" #include "llvm/Debuginfod/Debuginfod.h" #include "llvm/Debuginfod/HTTPClient.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/ThreadPool.h" using namespace llvm; -cl::OptionCategory DebuginfodCategory("llvm-debuginfod Options"); - -static cl::list ScanPaths(cl::Positional, - cl::desc(""), - cl::cat(DebuginfodCategory)); - -static cl::opt - Port("p", cl::init(0), - cl::desc("Port to listen on. Set to 0 to bind to any available port."), - cl::cat(DebuginfodCategory)); - -static cl::opt - HostInterface("i", cl::init("0.0.0.0"), - cl::desc("Host interface to bind to."), - cl::cat(DebuginfodCategory)); - -static cl::opt - ScanInterval("t", cl::init(300), - cl::desc("Number of seconds to wait between subsequent " - "automated scans of the filesystem."), - cl::cat(DebuginfodCategory)); - -static cl::opt MinInterval( - "m", cl::init(10), - cl::desc( - "Minimum number of seconds to wait before an on-demand update can be " - "triggered by a request for a buildid which is not in the collection."), - cl::cat(DebuginfodCategory)); - -static cl::opt - MaxConcurrency("c", cl::init(0), - cl::desc("Maximum number of files to scan concurrently. If " - "0, use the hardware concurrency."), - cl::cat(DebuginfodCategory)); - -static cl::opt VerboseLogging("v", cl::init(false), - cl::desc("Enable verbose logging."), - cl::cat(DebuginfodCategory)); +// Command-line option boilerplate. +namespace { +enum ID { + OPT_INVALID = 0, // This is not an option ID. 
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, +#include "Opts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) \ + static constexpr StringLiteral NAME##_init[] = VALUE; \ + static constexpr ArrayRef NAME(NAME##_init, \ + std::size(NAME##_init) - 1); +#include "Opts.inc" +#undef PREFIX + +static constexpr opt::OptTable::Info InfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + { \ + PREFIX, NAME, HELPTEXT, \ + METAVAR, OPT_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, \ + OPT_##ALIAS, ALIASARGS, VALUES}, +#include "Opts.inc" +#undef OPTION +}; + +class DebuginfodOptTable : public opt::GenericOptTable { +public: + DebuginfodOptTable() : GenericOptTable(InfoTable) {} +}; +} // end anonymous namespace + +// Options +static unsigned Port; +static std::string HostInterface; +static int ScanInterval; +static double MinInterval; +static size_t MaxConcurrency; +static bool VerboseLogging; +static std::vector ScanPaths; ExitOnError ExitOnErr; +template +static void parseIntArg(const opt::InputArgList &Args, int ID, T &Value, + T Default) { + if (const opt::Arg *A = Args.getLastArg(ID)) { + StringRef V(A->getValue()); + if (!llvm::to_integer(V, Value, 0)) { + errs() << A->getSpelling() + ": expected an integer, but got '" + V + "'"; + exit(1); + } + } else { + Value = Default; + } +} + +static void parseArgs(int argc, char **argv) { + DebuginfodOptTable Tbl; + llvm::StringRef ToolName = argv[0]; + llvm::BumpPtrAllocator A; + llvm::StringSaver Saver{A}; + opt::InputArgList Args = + Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) { + llvm::errs() << Msg << '\n'; + std::exit(1); + }); + + if (Args.hasArg(OPT_help)) { + Tbl.printHelp(llvm::outs(), + "llvm-debuginfod [options] ", + ToolName.str().c_str()); + std::exit(0); + } + + VerboseLogging = Args.hasArg(OPT_verbose_logging); + ScanPaths = Args.getAllArgValues(OPT_INPUT); + + parseIntArg(Args, OPT_port, Port, 0u); + parseIntArg(Args, OPT_scan_interval, ScanInterval, 300); + parseIntArg(Args, OPT_max_concurrency, MaxConcurrency, size_t(0)); + + if (const opt::Arg *A = Args.getLastArg(OPT_min_interval)) { + StringRef V(A->getValue()); + if (!llvm::to_float(V, MinInterval)) { + errs() << A->getSpelling() + ": expected a number, but got '" + V + "'"; + exit(1); + } + } else { + MinInterval = 10.0; + } + + HostInterface = Args.getLastArgValue(OPT_host_interface, "0.0.0.0"); +} + int main(int argc, char **argv) { InitLLVM X(argc, argv); HTTPClient::initialize(); - cl::HideUnrelatedOptions({&DebuginfodCategory}); - cl::ParseCommandLineOptions(argc, argv); + parseArgs(argc, argv); SmallVector Paths; for (const std::string &Path : ScanPaths) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod/BUILD.gn index c8ee330a867cb..236124f351bf7 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod/BUILD.gn @@ -1,6 +1,12 @@ import("//llvm/tools/binutils_symlinks.gni") +import("//llvm/utils/TableGen/tablegen.gni") import("//llvm/utils/gn/build/symlink_or_copy.gni") +tablegen("Opts") { + visibility = [ ":llvm-debuginfod" ] + args = [ "-gen-opt-parser-defs" ] +} + if (llvm_install_binutils_symlinks) { symlink_or_copy("debuginfod") { deps = [ ":llvm-debuginfod" ] @@ -19,7 +25,9 @@ group("symlinks") { executable("llvm-debuginfod") { 
deps = [ + ":Opts", "//llvm/lib/Debuginfod", + "//llvm/lib/Option", "//llvm/lib/Support", ] sources = [ "llvm-debuginfod.cpp" ] From 68ef0e95b20ac1bebb119977fe7c9ac08a764ebe Mon Sep 17 00:00:00 2001 From: Spenser Bauman Date: Tue, 30 May 2023 14:45:19 -0700 Subject: [PATCH 173/704] [mlir][tosa] Implement lowering for tosa.rfft2d Implement a lowering for tosa.rfft2d to linalg.generic in the TosaToLinalg transform. Reviewed By: eric-k256 Differential Revision: https://reviews.llvm.org/D151095 --- .../Conversion/TosaToLinalg/TosaToLinalg.cpp | 160 ++++++++++++++++++ .../TosaToLinalg/tosa-to-linalg.mlir | 129 ++++++++++++++ 2 files changed, 289 insertions(+) diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 9e0cccff6cf99..0ca05882cca74 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -12,7 +12,9 @@ #include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Utils/IndexingUtils.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -20,6 +22,7 @@ #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/ImplicitLocOpBuilder.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" @@ -2021,6 +2024,162 @@ class TableConverter : public OpRewritePattern { } }; +struct RFFT2dConverter final : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + static bool isRankedTensor(Type type) { return isa(type); } + + static OpFoldResult halfPlusOne(OpBuilder &builder, Location loc, + OpFoldResult ofr) { + auto one = builder.create(loc, 1); + auto two = builder.create(loc, 2); + + auto value = getValueOrCreateConstantIndexOp(builder, loc, ofr); + auto divBy2 = builder.createOrFold(loc, value, two); + auto plusOne = builder.createOrFold(loc, divBy2, one); + return getAsOpFoldResult(plusOne); + } + + static RankedTensorType + computeOutputShape(OpBuilder &builder, Location loc, Value input, + llvm::SmallVectorImpl &dynamicSizes) { + // Get [N, H, W] + auto dims = linalg::getMixedDimensions(builder, loc, input); + + // Set W = (W / 2) + 1 to account for the half-sized W dimension of the + // output tensors. 
+ dims[2] = halfPlusOne(builder, loc, dims[2]); + + llvm::SmallVector staticSizes; + dispatchIndexOpFoldResults(dims, dynamicSizes, staticSizes); + + auto elementType = + input.getType().cast().getElementType(); + return RankedTensorType::get(staticSizes, elementType); + } + + static Value createZeroTensor(PatternRewriter &rewriter, Location loc, + RankedTensorType type, + llvm::ArrayRef dynamicSizes) { + auto emptyTensor = + rewriter.create(loc, type, dynamicSizes); + auto fillValueAttr = rewriter.getZeroAttr(type.getElementType()); + auto fillValue = rewriter.create(loc, fillValueAttr); + auto filledTensor = rewriter + .create(loc, ValueRange{fillValue}, + ValueRange{emptyTensor}) + .result(); + return filledTensor; + } + + static Value castIndexToFloat(OpBuilder &builder, Location loc, + FloatType type, Value value) { + auto integerVal = + builder.create(loc, builder.getI64Type(), value); + + return builder.create(loc, type, integerVal); + } + + static Value createLinalgIndex(OpBuilder &builder, Location loc, + FloatType type, int64_t index) { + auto indexVal = builder.create(loc, index); + return castIndexToFloat(builder, loc, type, indexVal); + } + + template + static llvm::SmallVector affineDimsExpr(OpBuilder &builder, + Args... args) { + return {builder.getAffineDimExpr(args)...}; + } + + LogicalResult matchAndRewrite(RFFT2dOp rfft2d, + PatternRewriter &rewriter) const override { + if (!llvm::all_of(rfft2d->getOperandTypes(), isRankedTensor) || + !llvm::all_of(rfft2d->getResultTypes(), isRankedTensor)) { + return rewriter.notifyMatchFailure(rfft2d, + "only supports ranked tensors"); + } + + auto loc = rfft2d.getLoc(); + auto input = rfft2d.getInput(); + auto elementType = + input.getType().cast().getElementType().cast(); + + // Compute the output type and set of dynamic sizes + llvm::SmallVector dynamicSizes; + auto outputType = computeOutputShape(rewriter, loc, input, dynamicSizes); + + // Iterator types for the linalg.generic implementation + llvm::SmallVector iteratorTypes = { + utils::IteratorType::parallel, utils::IteratorType::parallel, + utils::IteratorType::parallel, utils::IteratorType::reduction, + utils::IteratorType::reduction}; + + // Inputs/outputs to the linalg.generic implementation + llvm::SmallVector genericOpInputs = {input}; + llvm::SmallVector genericOpOutputs = { + createZeroTensor(rewriter, loc, outputType, dynamicSizes), + createZeroTensor(rewriter, loc, outputType, dynamicSizes)}; + + // Indexing maps for input and output tensors + auto indexingMaps = AffineMap::inferFromExprList(llvm::ArrayRef{ + affineDimsExpr(rewriter, 0, 3, 4), affineDimsExpr(rewriter, 0, 1, 2), + affineDimsExpr(rewriter, 0, 1, 2)}); + + // Width and height dimensions of the original input. 
+ auto dimH = linalg::createOrFoldDimOp(rewriter, loc, input, 1); + auto dimW = linalg::createOrFoldDimOp(rewriter, loc, input, 2); + + // Constants and dimension sizes + auto twoPiAttr = rewriter.getFloatAttr(elementType, 6.283185307179586); + auto twoPi = rewriter.create(loc, twoPiAttr); + auto constH = castIndexToFloat(rewriter, loc, elementType, dimH); + auto constW = castIndexToFloat(rewriter, loc, elementType, dimW); + + auto buildBody = [&](OpBuilder &builder, Location loc, ValueRange args) { + Value valReal = args[0]; + Value sumReal = args[1]; + Value sumImag = args[2]; + + // Indices for angle computation + auto oy = createLinalgIndex(builder, loc, elementType, 1); + auto ox = createLinalgIndex(builder, loc, elementType, 2); + auto iy = createLinalgIndex(builder, loc, elementType, 3); + auto ix = createLinalgIndex(builder, loc, elementType, 4); + + // angle = 2 * pi() * ((iy * oy) / H + (ix * ox) / W) + auto iyXoy = builder.create(loc, iy, oy); + auto ixXox = builder.create(loc, ix, ox); + auto yComponent = builder.create(loc, iyXoy, constH); + auto xComponent = builder.create(loc, ixXox, constW); + auto sumXY = builder.create(loc, yComponent, xComponent); + auto angle = builder.create(loc, twoPi, sumXY); + + // realComponent = valReal * cos(angle) + // imagComponent = valReal * sin(angle) + auto cosAngle = builder.create(loc, angle); + auto sinAngle = builder.create(loc, angle); + auto realComponent = + builder.create(loc, valReal, cosAngle); + auto imagComponent = + builder.create(loc, valReal, sinAngle); + + // outReal = sumReal + realComponent + // outImag = sumImag - imagComponent + auto outReal = builder.create(loc, sumReal, realComponent); + auto outImag = builder.create(loc, sumImag, imagComponent); + + builder.create(loc, ValueRange{outReal, outImag}); + }; + + rewriter.replaceOpWithNewOp( + rfft2d, rfft2d.getResultTypes(), genericOpInputs, genericOpOutputs, + indexingMaps, iteratorTypes, buildBody); + + return success(); + } +}; + } // namespace void mlir::tosa::populateTosaToLinalgConversionPatterns( @@ -2083,6 +2242,7 @@ void mlir::tosa::populateTosaToLinalgConversionPatterns( GatherConverter, RescaleConverter, ReverseConverter, + RFFT2dConverter, TableConverter, TileConverter, TransposeConverter>(patterns->getContext()); diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index 9e5615e5c33f9..1f66c669bafb6 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -1412,3 +1412,132 @@ func.func @select_fp32(%arg0: tensor<1x1x5x5xi1>, %arg1: tensor<1x12x5x5xf32>, % return %0 : tensor<1x12x5x5xf32> } +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> + +// CHECK-LABEL: @test_static_rfft2d +// CHECK-SAME: (%[[ARG_0:[0-9a-zA-Z_]*]]: +func.func @test_static_rfft2d(%arg0: tensor<5x5x8xf32>) -> (tensor<5x5x5xf32>, tensor<5x5x5xf32>) { +// CHECK: %[[CST_1:.*]] = arith.constant 1 : index +// CHECK: %[[CST_2:.*]] = arith.constant 2 : index +// CHECK: %[[CST_8:.*]] = arith.constant 8 : index +// CHECK: %[[CST_4:.*]] = arith.constant 4 : index +// CHECK: %[[CST_5:.*]] = arith.constant 5 : index +// CHECK: %[[EMPTY_0:.*]] = tensor.empty() : tensor<5x5x5xf32> +// CHECK: %[[CST_ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAR_1:.*]] = linalg.fill ins(%[[CST_ZERO:.*]] : f32) outs(%[[EMPTY_0:.*]] : tensor<5x5x5xf32>) 
-> tensor<5x5x5xf32> +// CHECK: %[[EMPTY_1:.*]] = tensor.empty() : tensor<5x5x5xf32> +// CHECK: %[[VAR_3:.*]] = linalg.fill ins(%[[CST_ZERO:.*]]: f32) outs(%[[EMPTY_1:.*]] : tensor<5x5x5xf32>) -> tensor<5x5x5xf32> +// CHECK: %[[CST_PI:.*]] = arith.constant 6.28318548 : f32 +// CHECK: %[[VAR_5:.*]] = arith.index_castui %[[CST_5:.*]] : index to i64 +// CHECK: %[[VAR_6:.*]] = arith.uitofp %[[VAR_5:.*]] : i64 to f32 +// CHECK: %[[VAR_7:.*]] = arith.index_castui %[[CST_8:.*]] : index to i64 +// CHECK: %[[VAR_8:.*]] = arith.uitofp %[[VAR_7:.*]] : i64 to f32 +// CHECK: linalg.generic { +// CHECK: indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], +// CHECK: iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} +// CHECK: ins(%[[ARG_0]] : tensor<5x5x8xf32>) +// CHECK: outs(%[[VAR_1]], %[[VAR_3]] : tensor<5x5x5xf32>, tensor<5x5x5xf32>) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT_0:.*]]: f32, %[[OUT_1:.*]]: f32): +// CHECK: %[[INDEX_1:.*]] = linalg.index 1 : index +// CHECK: %[[VAR_10:.*]] = arith.index_castui %[[INDEX_1]] : index to i64 +// CHECK: %[[VAR_11:.*]] = arith.uitofp %[[VAR_10]] : i64 to f32 +// CHECK: %[[INDEX_2:.*]] = linalg.index 2 : index +// CHECK: %[[VAR_13:.*]] = arith.index_castui %[[INDEX_2]] : index to i64 +// CHECK: %[[VAR_14:.*]] = arith.uitofp %[[VAR_13]] : i64 to f32 +// CHECK: %[[INDEX_3:.*]] = linalg.index 3 : index +// CHECK: %[[VAR_16:.*]] = arith.index_castui %[[INDEX_3]] : index to i64 +// CHECK: %[[VAR_17:.*]] = arith.uitofp %[[VAR_16]] : i64 to f32 +// CHECK: %[[INDEX_4:.*]] = linalg.index 4 : index +// CHECK: %[[VAR_19:.*]] = arith.index_castui %[[INDEX_4]] : index to i64 +// CHECK: %[[VAR_20:.*]] = arith.uitofp %[[VAR_19]] : i64 to f32 +// CHECK: %[[VAR_21:.*]] = arith.mulf %[[VAR_17]], %[[VAR_11]] : f32 +// CHECK: %[[VAR_22:.*]] = arith.mulf %[[VAR_20]], %[[VAR_14]] : f32 +// CHECK: %[[XCOMP:.*]] = arith.divf %[[VAR_21]], %[[VAR_6]] : f32 +// CHECK: %[[YCOMP:.*]] = arith.divf %[[VAR_22]], %[[VAR_8]] : f32 +// CHECK: %[[VAR_25:.*]] = arith.addf %[[XCOMP]], %[[YCOMP]] : f32 +// CHECK: %[[ALPHA:.*]] = arith.mulf %[[CST_PI]], %[[VAR_25]] : f32 +// CHECK: %[[COS_ALPHA:.*]] = math.cos %[[ALPHA]] : f32 +// CHECK: %[[SIN_ALPHA:.*]] = math.sin %[[ALPHA]] : f32 +// CHECK: %[[REAL_CONTRIB:.*]] = arith.mulf %[[IN]], %[[COS_ALPHA]] : f32 +// CHECK: %[[IMAG_CONTRIB:.*]] = arith.mulf %[[IN]], %[[SIN_ALPHA]] : f32 +// CHECK: %[[OUT_REAL:.*]] = arith.addf %[[OUT_0]], %[[REAL_CONTRIB]] : f32 +// CHECK: %[[OUT_IMAG:.*]] = arith.subf %[[OUT_1]], %[[IMAG_CONTRIB]] : f32 +// CHECK: linalg.yield %[[OUT_REAL]], %[[OUT_IMAG]] : f32, f32 +// CHECK: } -> (tensor<5x5x5xf32>, tensor<5x5x5xf32>) + + %output_real, %output_imag = "tosa.rfft2d"(%arg0) {} : (tensor<5x5x8xf32>) -> (tensor<5x5x5xf32>, tensor<5x5x5xf32>) + return %output_real, %output_imag : tensor<5x5x5xf32>, tensor<5x5x5xf32> +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d4)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)> + +// CHECK-LABEL: @test_dynamic_rfft2d +// CHECK-SAME: (%[[ARG_0:[0-9a-zA-Z_]*]]: +func.func @test_dynamic_rfft2d(%arg0: tensor) -> (tensor, tensor) { +// CHECK: %[[CST_0:.*]] = arith.constant 0 : index +// CHECK: %[[DIM:.*]] = tensor.dim %[[ARG_0]], %[[CST_0]] : tensor +// CHECK: %[[CST_1:.*]] = arith.constant 1 : index +// CHECK: %[[DIM_0:.*]] = tensor.dim %[[ARG_0]], %[[CST_1]] : tensor +// CHECK: %[[CST_2:.*]] = arith.constant 2 : index +// CHECK: %[[DIM_1:.*]] = tensor.dim %[[ARG_0]], %[[CST_2]] : tensor +// 
CHECK: %[[CST_1_2:.*]] = arith.constant 1 : index +// CHECK: %[[CST_2_3:.*]] = arith.constant 2 : index +// CHECK: %[[VAR_0:.*]] = arith.divui %[[DIM_1]], %[[CST_2_3]] : index +// CHECK: %[[VAR_1:.*]] = arith.addi %[[VAR_0]], %[[CST_1_2]] : index +// CHECK: %[[EMPTY_0:.*]] = tensor.empty(%[[DIM]], %[[DIM_0]], %[[VAR_1]]) : tensor +// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAR_3:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[EMPTY_0]] : tensor) -> tensor +// CHECK: %[[EMPTY_1:.*]] = tensor.empty(%[[DIM]], %[[DIM_0]], %[[VAR_1]]) : tensor +// CHECK: %[[CST_4:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAR_5:.*]] = linalg.fill ins(%[[CST_4]] : f32) outs(%[[EMPTY_1]] : tensor) -> tensor +// CHECK: %[[CST_1_5:.*]] = arith.constant 1 : index +// CHECK: %[[DIM_6:.*]] = tensor.dim %[[ARG_0]], %[[CST_1_5]] : tensor +// CHECK: %[[CST_2:.*]] = arith.constant 2 : index +// CHECK: %[[DIM_8:.*]] = tensor.dim %[[ARG_0]], %[[CST_2]] : tensor +// CHECK: %[[CST_9:.*]] = arith.constant 6.28318548 : f32 +// CHECK: %[[VAR_6:.*]] = arith.index_castui %[[DIM_6]] : index to i64 +// CHECK: %[[VAR_7:.*]] = arith.uitofp %[[VAR_6]] : i64 to f32 +// CHECK: %[[VAR_8:.*]] = arith.index_castui %[[DIM_8]] : index to i64 +// CHECK: %[[VAR_9:.*]] = arith.uitofp %[[VAR_8]] : i64 to f32 +// CHECK: linalg.generic { +// CHECK: indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]]], +// CHECK: iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]} +// CHECK: ins(%[[ARG_0]] : tensor) +// CHECK: outs(%[[VAR_3]], %[[VAR_5]] : tensor, tensor) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT_0:.*]]: f32, %[[OUT_1:.*]]: f32): +// CHECK: %[[INDEX_1:.*]] = linalg.index 1 : index +// CHECK: %[[VAR_12:.*]] = arith.index_castui %[[INDEX_1]] : index to i64 +// CHECK: %[[VAR_13:.*]] = arith.uitofp %[[VAR_12]] : i64 to f32 +// CHECK: %[[INDEX_2:.*]] = linalg.index 2 : index +// CHECK: %[[VAR_15:.*]] = arith.index_castui %[[INDEX_2]] : index to i64 +// CHECK: %[[VAR_16:.*]] = arith.uitofp %[[VAR_15]] : i64 to f32 +// CHECK: %[[INDEX_3:.*]] = linalg.index 3 : index +// CHECK: %[[VAR_18:.*]] = arith.index_castui %[[INDEX_3]] : index to i64 +// CHECK: %[[VAR_19:.*]] = arith.uitofp %[[VAR_18]] : i64 to f32 +// CHECK: %[[INDEX_4:.*]] = linalg.index 4 : index +// CHECK: %[[VAR_21:.*]] = arith.index_castui %[[INDEX_4]] : index to i64 +// CHECK: %[[VAR_22:.*]] = arith.uitofp %[[VAR_21]] : i64 to f32 +// CHECK: %[[VAR_23:.*]] = arith.mulf %[[VAR_19]], %[[VAR_13]] : f32 +// CHECK: %[[VAR_24:.*]] = arith.mulf %[[VAR_22]], %[[VAR_16]] : f32 +// CHECK: %[[XCOMP:.*]] = arith.divf %[[VAR_23]], %[[VAR_7]] : f32 +// CHECK: %[[YCOMP:.*]] = arith.divf %[[VAR_24]], %[[VAR_9]] : f32 +// CHECK: %[[VAR_27:.*]] = arith.addf %[[XCOMP]], %[[YCOMP]] : f32 +// CHECK: %[[ALPHA:.*]] = arith.mulf %[[CST_9]], %[[VAR_27]] : f32 +// CHECK: %[[COS_ALPHA:.*]] = math.cos %[[ALPHA]] : f32 +// CHECK: %[[SIN_ALPHA:.*]] = math.sin %[[ALPHA]] : f32 +// CHECK: %[[REAL_CONTRIB:.*]] = arith.mulf %[[IN]], %[[COS_ALPHA]] : f32 +// CHECK: %[[IMAG_CONTRIB:.*]] = arith.mulf %[[IN]], %[[SIN_ALPHA]] : f32 +// CHECK: %[[OUT_REAL:.*]] = arith.addf %[[OUT_0]], %[[REAL_CONTRIB]] : f32 +// CHECK: %[[OUT_IMAG:.*]] = arith.subf %[[OUT_1]], %[[IMAG_CONTRIB]] : f32 +// CHECK: linalg.yield %[[OUT_REAL]], %[[OUT_IMAG]] : f32, f32 +// CHECK: } -> (tensor, tensor) + + %output_real, %output_imag = "tosa.rfft2d"(%arg0) {} : (tensor) -> (tensor, tensor) + return %output_real, %output_imag : tensor, tensor +} From 749aeeaf66a483345e12f69cc1f55dad4e997297 
Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 31 May 2023 07:20:24 +0800 Subject: [PATCH 174/704] [clang][ExtractAPI] Fix -Wpessimizing-move in DeclarationFragments.h (NFC) /Users/jiefu/llvm-project/clang/include/clang/ExtractAPI/DeclarationFragments.h:118:26: error: moving a temporary object prevents copy elision [-Werror,-Wpessimizing-move] Fragments.insert(It, std::move(Fragment(Spelling, Kind, PreciseIdentifier, ^ /Users/jiefu/llvm-project/clang/include/clang/ExtractAPI/DeclarationFragments.h:118:26: note: remove std::move call here Fragments.insert(It, std::move(Fragment(Spelling, Kind, PreciseIdentifier, ^~~~~~~~~~ 1 error generated. --- clang/include/clang/ExtractAPI/DeclarationFragments.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/ExtractAPI/DeclarationFragments.h b/clang/include/clang/ExtractAPI/DeclarationFragments.h index 4c1b830807047..3b909b066866e 100644 --- a/clang/include/clang/ExtractAPI/DeclarationFragments.h +++ b/clang/include/clang/ExtractAPI/DeclarationFragments.h @@ -115,8 +115,8 @@ class DeclarationFragments { FragmentKind Kind, StringRef PreciseIdentifier = "", const Decl *Declaration = nullptr) { - Fragments.insert(It, std::move(Fragment(Spelling, Kind, PreciseIdentifier, - Declaration))); + Fragments.insert(It, Fragment(Spelling, Kind, PreciseIdentifier, + Declaration)); return *this; } From d54b2d1c01d05dd0e29e0a7fd90adacc8998841a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 30 May 2023 16:28:22 -0700 Subject: [PATCH 175/704] [ExtractAPI] clang-format DeclarationFragments::insert --- clang/include/clang/ExtractAPI/DeclarationFragments.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/ExtractAPI/DeclarationFragments.h b/clang/include/clang/ExtractAPI/DeclarationFragments.h index 3b909b066866e..82f0c42ab8aa0 100644 --- a/clang/include/clang/ExtractAPI/DeclarationFragments.h +++ b/clang/include/clang/ExtractAPI/DeclarationFragments.h @@ -115,8 +115,8 @@ class DeclarationFragments { FragmentKind Kind, StringRef PreciseIdentifier = "", const Decl *Declaration = nullptr) { - Fragments.insert(It, Fragment(Spelling, Kind, PreciseIdentifier, - Declaration)); + Fragments.insert(It, + Fragment(Spelling, Kind, PreciseIdentifier, Declaration)); return *this; } From 0016141a9ae34a2022021da24821d68890ec45ab Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 30 May 2023 19:31:27 -0400 Subject: [PATCH 176/704] [gn] port 48e5f704c55f (no more LLVM_HAVE_LIBXAR in ld64.lld) --- llvm/utils/gn/secondary/lld/test/BUILD.gn | 7 ------- 1 file changed, 7 deletions(-) diff --git a/llvm/utils/gn/secondary/lld/test/BUILD.gn b/llvm/utils/gn/secondary/lld/test/BUILD.gn index 5786b03091223..01ee607e582eb 100644 --- a/llvm/utils/gn/secondary/lld/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lld/test/BUILD.gn @@ -1,6 +1,5 @@ import("//llvm/lib/DebugInfo/PDB/enable_dia.gni") import("//llvm/triples.gni") -import("//llvm/utils/gn/build/libs/xar/enable.gni") import("//llvm/utils/gn/build/libs/xml/enable.gni") import("//llvm/utils/gn/build/libs/zlib/enable.gni") import("//llvm/utils/gn/build/write_cmake_config.gni") @@ -66,12 +65,6 @@ write_cmake_config("lit_site_cfg") { values += [ "LLVM_ENABLE_DIA_SDK=0" ] # Must be 0. } - if (llvm_enable_libxar) { - values += [ "LLVM_HAVE_LIBXAR=1" ] - } else { - values += [ "LLVM_HAVE_LIBXAR=0" ] # Must be 0. 
- } - if (llvm_enable_libxml2) { values += [ "LLVM_ENABLE_LIBXML2=1" ] } else { From 79fadde50a7e3200e1e3576852f1aa932459ec1e Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 30 May 2023 23:31:47 +0000 Subject: [PATCH 177/704] [gn build] Port bf63b15bd4bf --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 296ac29d31c8f..98cfb2be947dd 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -153,6 +153,7 @@ if (current_toolchain == default_toolchain) { "__algorithm/pstl_backends/cpu_backends/fill.h", "__algorithm/pstl_backends/cpu_backends/find_if.h", "__algorithm/pstl_backends/cpu_backends/for_each.h", + "__algorithm/pstl_backends/cpu_backends/merge.h", "__algorithm/pstl_backends/cpu_backends/serial.h", "__algorithm/pstl_backends/cpu_backends/thread.h", "__algorithm/pstl_backends/cpu_backends/transform.h", @@ -161,6 +162,7 @@ if (current_toolchain == default_toolchain) { "__algorithm/pstl_find.h", "__algorithm/pstl_for_each.h", "__algorithm/pstl_frontend_dispatch.h", + "__algorithm/pstl_merge.h", "__algorithm/pstl_transform.h", "__algorithm/push_heap.h", "__algorithm/ranges_adjacent_find.h", From aaa33b6a98de2be7cdc827b13e60c103206d6461 Mon Sep 17 00:00:00 2001 From: Jennifer Yu Date: Tue, 30 May 2023 10:20:31 -0700 Subject: [PATCH 178/704] Fix assert "DeclRefExpr for Decl not entered in LocalDeclMap?" Currently compiler assert when passing variable "memspace" in omp_init_allocator. omp_allocator_handle_t alloc=omp_init_allocator(memspace,1,traits) The problem is memspace is not mapping to the target region. During the call to emitAllocatorInit, calls to EmitVarDecl for "alloc", then emit initialization of "alloc" that cause to assert. If I understant correct, it is not necessary to emit variable initialization, since "allocator" is private to target region. To fix this call CGF.EmitAutoVarAlloca(allocator) instead CGF.EmitVarDecl(allocator). Differential Revision: https://reviews.llvm.org/D151743 --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 2 +- clang/test/OpenMP/target_uses_allocators.c | 41 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 1f1db83378233..5957e59097709 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6041,7 +6041,7 @@ void CGOpenMPRuntime::emitUsesAllocatorsInit(CodeGenFunction &CGF, CGM.getModule(), OMPRTL___kmpc_init_allocator), {ThreadId, MemSpaceHandle, NumTraits, Traits}); // Store to allocator. 
- CGF.EmitVarDecl(*cast( + CGF.EmitAutoVarAlloca(*cast( cast(Allocator->IgnoreParenImpCasts())->getDecl())); LValue AllocatorLVal = CGF.EmitLValue(Allocator->IgnoreParenImpCasts()); AllocatorVal = diff --git a/clang/test/OpenMP/target_uses_allocators.c b/clang/test/OpenMP/target_uses_allocators.c index eab202671e793..0352a5874bf12 100644 --- a/clang/test/OpenMP/target_uses_allocators.c +++ b/clang/test/OpenMP/target_uses_allocators.c @@ -64,6 +64,35 @@ void fie(void) { {} } +typedef enum omp_memspace_handle_t { + omp_default_mem_space = 0, + omp_large_cap_mem_space = 1, + omp_const_mem_space = 2, + omp_high_bw_mem_space = 3, + omp_low_lat_mem_space = 4, + llvm_omp_target_host_mem_space = 100, + llvm_omp_target_shared_mem_space = 101, + llvm_omp_target_device_mem_space = 102, + KMP_MEMSPACE_MAX_HANDLE = __UINTPTR_MAX__ +} omp_memspace_handle_t; + +extern omp_allocator_handle_t +omp_init_allocator(omp_memspace_handle_t memspace, int ntraits, + const omp_alloctrait_t traits[]); + +void *omp_aligned_alloc(unsigned long alignment, unsigned long size, + omp_allocator_handle_t allocator); +extern void * omp_alloc(int size, omp_allocator_handle_t a); +#define N 1024 + +void foo() { + int errors = 0; + omp_memspace_handle_t memspace = omp_default_mem_space; + omp_alloctrait_t traits[1] = {{omp_atk_alignment, 64}}; + omp_allocator_handle_t alloc = omp_init_allocator(memspace,1,traits); + #pragma omp target map(tofrom: errors) uses_allocators(alloc(traits)) + { } +} #endif // CHECK: %[[#R0:]] = call i32 @__kmpc_global_thread_num(ptr @1) @@ -140,3 +169,15 @@ void fie(void) { // CHECK: [[ALLOCATOR:%.+]] = load i64, ptr [[MY_ALLOCATOR_ADDR]], // CHECK: [[CONV:%.+]] = inttoptr i64 [[ALLOCATOR]] to ptr // CHECK: call void @__kmpc_destroy_allocator(i32 %{{.+}}, ptr [[CONV]]) + +// CHECK: [[TRAITS_ADDR_REF:%.+]] = alloca ptr, +// CHECK: [[MY_ALLOCATOR_ADDR:%alloc]] = alloca i64, +// CHECK: [[TRAITS_ADDR:%.+]] = load ptr, ptr [[TRAITS_ADDR_REF]], +// CHECK: [[ALLOCATOR:%.+]] = call ptr @__kmpc_init_allocator(i32 %{{.+}}, ptr null, i32 1, ptr [[TRAITS_ADDR]]) +// CHECK: [[CONV:%.+]] = ptrtoint ptr [[ALLOCATOR]] to i64 +// CHECK: store i64 [[CONV]], ptr [[MY_ALLOCATOR_ADDR]], + +// Destroy allocator upon exit from the region. +// CHECK: [[ALLOCATOR:%.+]] = load i64, ptr [[MY_ALLOCATOR_ADDR]], +// CHECK: [[CONV1:%.+]] = inttoptr i64 [[ALLOCATOR]] to ptr +// CHECK: call void @__kmpc_destroy_allocator(i32 %{{.+}}, ptr [[CONV1]]) From 14186773e79b8c6787afac2f9ee69738151377ec Mon Sep 17 00:00:00 2001 From: Jim Ingham Date: Tue, 30 May 2023 17:12:35 -0700 Subject: [PATCH 179/704] Fix SBValue::FindValue for file static variables This was just a thinko. The API StackFrame::GetVariableList takes a bool for "get_file_globals" which if true will also find file statics and file globals. But we only were passing that as true if the ValueType was eValueTypeVariableGlobal, which meant that we never find file statics. It's okay if we cast too wide a net when we do GetVariableList as later on we check against the ValueType to filter globals from statics. There was a test that had a whole bunch of globals and tested FindValue on all of them, but had no statics. So I just made one of the globals a file static, which verifies the fix. 
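To make the shape of the fix concrete, below is a minimal standalone C++ sketch of the "fetch a superset, then filter by ValueType" idea. The names only mimic the LLDB ones (GetVariableList, FindValue, eValueTypeVariableStatic and friends); this is a toy model for illustration, not the lldb sources.

```
// Toy illustration of "cast a wide net, then filter" -- not LLDB code.
#include <iostream>
#include <optional>
#include <string>
#include <vector>

enum ValueType {
  eValueTypeVariableGlobal,
  eValueTypeVariableStatic,
  eValueTypeVariableLocal
};

struct Variable {
  std::string name;
  ValueType type;
};

// Stand-in for StackFrame::GetVariableList(get_file_globals, ...).
std::vector<Variable> GetVariableList(bool get_file_globals) {
  std::vector<Variable> vars = {{"local_counter", eValueTypeVariableLocal}};
  if (get_file_globals) {
    vars.push_back({"my_global", eValueTypeVariableGlobal});
    vars.push_back({"my_char", eValueTypeVariableStatic}); // file static
  }
  return vars;
}

// Stand-in for SBFrame::FindValue(name, value_type).
std::optional<Variable> FindValue(const std::string &name,
                                  ValueType value_type) {
  // The fix: ask for file globals when the caller wants either globals or
  // statics; the ValueType check below keeps the result precise, so the
  // wider net is harmless.
  const bool get_file_globals = value_type == eValueTypeVariableGlobal ||
                                value_type == eValueTypeVariableStatic;
  for (const Variable &v : GetVariableList(get_file_globals))
    if (v.name == name && v.type == value_type)
      return v;
  return std::nullopt;
}

int main() {
  std::cout << (FindValue("my_char", eValueTypeVariableStatic)
                    ? "found file static\n"
                    : "not found\n"); // without the wider net this came back empty
}
```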
Differential Revision: https://reviews.llvm.org/D151392 --- lldb/source/API/SBFrame.cpp | 3 ++- lldb/test/API/python_api/process/TestProcessAPI.py | 8 ++++---- lldb/test/API/python_api/process/main.cpp | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lldb/source/API/SBFrame.cpp b/lldb/source/API/SBFrame.cpp index 285469c1063b3..e31297446007a 100644 --- a/lldb/source/API/SBFrame.cpp +++ b/lldb/source/API/SBFrame.cpp @@ -602,7 +602,8 @@ SBValue SBFrame::FindValue(const char *name, ValueType value_type, stop_if_block_is_inlined_function, [frame](Variable *v) { return v->IsInScope(frame); }, &variable_list); - if (value_type == eValueTypeVariableGlobal) { + if (value_type == eValueTypeVariableGlobal + || value_type == eValueTypeVariableStatic) { const bool get_file_globals = true; VariableList *frame_vars = frame->GetVariableList(get_file_globals, nullptr); diff --git a/lldb/test/API/python_api/process/TestProcessAPI.py b/lldb/test/API/python_api/process/TestProcessAPI.py index df41397eb32b8..65330e5163f72 100644 --- a/lldb/test/API/python_api/process/TestProcessAPI.py +++ b/lldb/test/API/python_api/process/TestProcessAPI.py @@ -49,8 +49,8 @@ def test_read_memory(self): ) frame = thread.GetFrameAtIndex(0) - # Get the SBValue for the global variable 'my_char'. - val = frame.FindValue("my_char", lldb.eValueTypeVariableGlobal) + # Get the SBValue for the file static variable 'my_char'. + val = frame.FindValue("my_char", lldb.eValueTypeVariableStatic) self.DebugSBValue(val) # Due to the typemap magic (see lldb.swig), we pass in 1 to ReadMemory and @@ -149,8 +149,8 @@ def test_write_memory(self): ) frame = thread.GetFrameAtIndex(0) - # Get the SBValue for the global variable 'my_char'. - val = frame.FindValue("my_char", lldb.eValueTypeVariableGlobal) + # Get the SBValue for the static variable 'my_char'. + val = frame.FindValue("my_char", lldb.eValueTypeVariableStatic) self.DebugSBValue(val) # If the variable does not have a load address, there's no sense diff --git a/lldb/test/API/python_api/process/main.cpp b/lldb/test/API/python_api/process/main.cpp index 07cde05e2a054..54bf3590ad431 100644 --- a/lldb/test/API/python_api/process/main.cpp +++ b/lldb/test/API/python_api/process/main.cpp @@ -3,7 +3,7 @@ // This simple program is to test the lldb Python API related to process. 
-char my_char = 'u'; +static char my_char = 'u'; char my_cstring[] = "lldb.SBProcess.ReadCStringFromMemory() works!"; char *my_char_ptr = (char *)"Does it work?"; uint32_t my_uint32 = 12345; From be9b79fb14ce0776e147860c9fbffc84ea7b39e6 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 30 May 2023 15:54:04 -0700 Subject: [PATCH 180/704] [lldb] Remove commented out code/logging in BreakpointSiteList (NFC) --- lldb/source/Breakpoint/BreakpointSiteList.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/lldb/source/Breakpoint/BreakpointSiteList.cpp b/lldb/source/Breakpoint/BreakpointSiteList.cpp index 32a2f24d411a1..ab15da82ea450 100644 --- a/lldb/source/Breakpoint/BreakpointSiteList.cpp +++ b/lldb/source/Breakpoint/BreakpointSiteList.cpp @@ -48,15 +48,8 @@ bool BreakpointSiteList::ShouldStop(StoppointCallbackContext *context, return true; } lldb::break_id_t BreakpointSiteList::FindIDByAddress(lldb::addr_t addr) { - BreakpointSiteSP bp = FindByAddress(addr); - if (bp) { - // DBLogIf(PD_LOG_BREAKPOINTS, "BreakpointSiteList::%s ( addr = 0x%8.8" - // PRIx64 " ) => %u", __FUNCTION__, (uint64_t)addr, bp->GetID()); + if (BreakpointSiteSP bp = FindByAddress(addr)) return bp.get()->GetID(); - } - // DBLogIf(PD_LOG_BREAKPOINTS, "BreakpointSiteList::%s ( addr = 0x%8.8" - // PRIx64 - // " ) => NONE", __FUNCTION__, (uint64_t)addr); return LLDB_INVALID_BREAK_ID; } From 7a1077baa01cd66afa193276796ee6679954d4e5 Mon Sep 17 00:00:00 2001 From: wren romano <2998727+wrengr@users.noreply.github.com> Date: Tue, 30 May 2023 13:31:49 -0700 Subject: [PATCH 181/704] [mlir][sparse] Improving SparseTensorDimSliceAttr methods This patch makes the following changes to `SparseTensorDimSliceAttr` methods: * Mark `isDynamic` constexpr. * Add new helpers `getStatic` and `getStaticString` to avoid repetition. * Moved the definitions for `getStatic{Offset,Stride,Size}` and `isCompletelyDynamic` out of the class declaration; because there's no benefit to inlining them. * Changed `parse` to use `kDynamic` rather than literals. * Changed `verify` to use the `isDynamic` helper. Reviewed By: aartbik Differential Revision: https://reviews.llvm.org/D150919 --- .../SparseTensor/IR/SparseTensorAttrDefs.td | 34 ++------- .../SparseTensor/IR/SparseTensorDialect.cpp | 74 +++++++++++++------ 2 files changed, 60 insertions(+), 48 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index 9fe425a40415b..d6c971b0cd36e 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -76,32 +76,14 @@ def SparseTensorDimSliceAttr : SparseTensor_Attr<"SparseTensorDimSlice", []> { let extraClassDeclaration = [{ /// Special value for dynamic offset/size/stride. 
static constexpr int64_t kDynamic = -1; - - static bool isDynamic(int64_t v) { - return v == kDynamic; - } - - std::optional getStaticOffset() const { - if (isDynamic(getOffset())) - return std::nullopt; - return static_cast(getOffset()); - }; - - std::optional getStaticStride() const { - if (isDynamic(getStride())) - return std::nullopt; - return static_cast(getStride()); - } - - std::optional getStaticSize() const { - if (isDynamic(getSize())) - return std::nullopt; - return static_cast(getSize()); - } - - bool isCompletelyDynamic() const { - return isDynamic(getOffset()) && isDynamic(getStride()) && isDynamic(getSize()); - }; + static constexpr bool isDynamic(int64_t v) { return v == kDynamic; } + static std::optional getStatic(int64_t v); + static std::string getStaticString(int64_t v); + + std::optional getStaticOffset() const; + std::optional getStaticStride() const; + std::optional getStaticSize() const; + bool isCompletelyDynamic() const; }]; let genVerifyDecl = 1; diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 7f8dcba77fc8e..490e35dfa2d05 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -31,6 +31,23 @@ using namespace mlir; using namespace mlir::sparse_tensor; +//===----------------------------------------------------------------------===// +// Additional convenience methods. +//===----------------------------------------------------------------------===// + +static constexpr bool acceptBitWidth(unsigned bitWidth) { + switch (bitWidth) { + case 0: + case 8: + case 16: + case 32: + case 64: + return true; + default: + return false; + } +} + //===----------------------------------------------------------------------===// // StorageLayout //===----------------------------------------------------------------------===// @@ -166,26 +183,39 @@ StorageLayout::getFieldIndexAndStride(SparseTensorFieldKind kind, // TensorDialect Attribute Methods. //===----------------------------------------------------------------------===// -static bool acceptBitWidth(unsigned bitWidth) { - switch (bitWidth) { - case 0: - case 8: - case 16: - case 32: - case 64: - return true; - default: - return false; - } +std::optional SparseTensorDimSliceAttr::getStatic(int64_t v) { + return isDynamic(v) ? std::nullopt + : std::make_optional(static_cast(v)); +} + +std::optional SparseTensorDimSliceAttr::getStaticOffset() const { + return getStatic(getOffset()); +} + +std::optional SparseTensorDimSliceAttr::getStaticStride() const { + return getStatic(getStride()); +} + +std::optional SparseTensorDimSliceAttr::getStaticSize() const { + return getStatic(getSize()); +} + +bool SparseTensorDimSliceAttr::isCompletelyDynamic() const { + return isDynamic(getOffset()) && isDynamic(getStride()) && + isDynamic(getSize()); +} + +std::string SparseTensorDimSliceAttr::getStaticString(int64_t v) { + return isDynamic(v) ? "?" : std::to_string(v); } void SparseTensorDimSliceAttr::print(AsmPrinter &printer) const { printer << "("; - printer << (getStaticOffset() ? std::to_string(*getStaticOffset()) : "?"); + printer << getStaticString(getOffset()); printer << ", "; - printer << (getStaticSize() ? std::to_string(*getStaticSize()) : "?"); + printer << getStaticString(getSize()); printer << ", "; - printer << (getStaticStride() ? 
std::to_string(*getStaticStride()) : "?"); + printer << getStaticString(getStride()); printer << ")"; } @@ -208,7 +238,7 @@ static ParseResult parseOptionalStaticSlice(int64_t &result, } Attribute SparseTensorDimSliceAttr::parse(AsmParser &parser, Type type) { - int64_t offset = -1, size = -1, stride = -1; + int64_t offset = kDynamic, size = kDynamic, stride = kDynamic; if (failed(parser.parseLParen()) || failed(parseOptionalStaticSlice(offset, parser)) || @@ -226,13 +256,13 @@ Attribute SparseTensorDimSliceAttr::parse(AsmParser &parser, Type type) { LogicalResult SparseTensorDimSliceAttr::verify(function_ref emitError, int64_t offset, int64_t size, int64_t stride) { - if ((offset == SparseTensorDimSliceAttr::kDynamic || offset >= 0) && - (size == SparseTensorDimSliceAttr::kDynamic || size > 0) && - (stride == SparseTensorDimSliceAttr::kDynamic || stride > 0)) { - return success(); - } - return emitError() - << "expect positive value or ? for slice offset/size/stride"; + if (!isDynamic(offset) && offset < 0) + return emitError() << "expect non-negative value or ? for slice offset"; + if (!isDynamic(size) && size <= 0) + return emitError() << "expect positive value or ? for slice size"; + if (!isDynamic(stride) && stride <= 0) + return emitError() << "expect positive value or ? for slice stride"; + return success(); } Type mlir::sparse_tensor::detail::getIntegerOrIndexType(MLIRContext *ctx, From fef23e8d871add8d885d335e4586a9fa420c1171 Mon Sep 17 00:00:00 2001 From: Pengxuan Zheng Date: Tue, 30 May 2023 10:44:15 -0700 Subject: [PATCH 182/704] [libc++] Include "bits/alltypes.h" to provide mbstate_t when using musl libc With D148542, we ran into the following libc++ build error when using musl libc. ``` .../musl/include/bits/alltypes.h:354:16: error: definition of type '__mbstate_t' conflicts with typedef of the same name typedef struct __mbstate_t { unsigned __opaque1, __opaque2; } mbstate_t; ^ .../sysroot/usr/include/bits/types/__mbstate_t.h:21:3: note: '__mbstate_t' declared here } __mbstate_t; ^ 1 error generated. ``` This is because the mbstate_t definition in musl libc conflicts with the one from "bits/types/mbstate_t.h", and this patch attempts to fix this build issue when musl libc is used. Reviewed By: iana Differential Revision: https://reviews.llvm.org/D151740 --- libcxx/include/__mbstate_t.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libcxx/include/__mbstate_t.h b/libcxx/include/__mbstate_t.h index 000af71119f49..2b9e719b1f2ed 100644 --- a/libcxx/include/__mbstate_t.h +++ b/libcxx/include/__mbstate_t.h @@ -35,7 +35,11 @@ # define __CORRECT_ISO_CPP_WCHAR_H_PROTO #endif -#if __has_include() +#if defined(_LIBCPP_HAS_MUSL_LIBC) +# define __NEED_mbstate_t +# include +# undef __NEED_mbstate_t +#elif __has_include() # include // works on most Unixes #elif __has_include() # include // works on Darwin From bce889c8df419d639beb0d387409d1ecbefdf579 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Tue, 30 May 2023 17:28:22 -0700 Subject: [PATCH 183/704] [BOLT] Align BranchInfo and FuncBranchData in DataAggregator::recordTrace `DataAggregator::recordTrace` serves two purposes: - Attaching LBR fallthrough ("trace") information to CFG (`getBranchInfo`), which eventually gets emitted as YAML profile. - Populating vector of offsets that gets added to `FuncBranchData`, which eventually gets emitted as fdata profile. `recordTrace` is invoked from `getFallthroughsInTrace` which checks its return status and passes on the collected vector of offsets to `doTrace`. 
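As a rough illustration of that dual bookkeeping, here is a toy C++ sketch (simplified types, not the BOLT sources): one walk over a trace produces both outputs — per-edge counts standing in for the CFG's BranchInfo (YAML side) and a vector of (from, to) offsets standing in for what gets handed to doTrace (fdata side). The all-or-nothing commit in the sketch mirrors the conservative option discussed further down.

```
// Toy sketch of recordTrace-style bookkeeping -- not the BOLT sources.
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

using Edge = std::pair<uint64_t, uint64_t>;  // (from offset, to offset)
using EdgeCounts = std::map<Edge, uint64_t>; // stand-in for CFG BranchInfo

// Walk the fallthrough path of one trace. On success, bump the per-edge
// counts and append the edges to Branches; on a malformed trace, return
// false without having touched either output.
bool recordTrace(const std::vector<Edge> &Path, uint64_t Count,
                 EdgeCounts &Counts, std::vector<Edge> &Branches) {
  std::vector<Edge> Buffered;
  for (const Edge &E : Path) {
    if (E.first > E.second) // toy stand-in for "no valid fallthrough here"
      return false;         // nothing has been committed yet
    Buffered.push_back(E);
  }
  // The whole trace checked out: commit to both profile representations.
  for (const Edge &E : Buffered) {
    Counts[E] += Count;    // YAML/CFG side
    Branches.push_back(E); // fdata side, later consumed by doTrace
  }
  return true;
}

int main() {
  EdgeCounts Counts;
  std::vector<Edge> Branches;
  bool Valid = recordTrace({{0x10, 0x20}, {0x24, 0x40}}, /*Count=*/2,
                           Counts, Branches);
  std::cout << "valid=" << Valid << " edges=" << Branches.size()
            << " count(0x10->0x20)=" << Counts[{0x10, 0x20}] << "\n";
}
```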
However, if a malformed trace is passed to `recordTrace` it might partially attach the profile to CFG and exit with false, not propagating the vector of offsets to `doTrace`. This leads to a difference between fdata and yaml profile collected from the same binary and the same perf file. (Skylake LBR errata might produce such malformed traces where the last entry is duplicated, resulting in invalid fallthrough path between the last two entries). There are two ways to handle this mismatch: conservative (aligned with fdata), or aggressive (aligned with yaml). Conservative approach would discard the trace entirely, buffering the CFG updates until all fallthroughs are confirmed. Aggressive approach would apply CFG updates and return the matching fallthroughs in the vector even if the trace is invalid (doesn't correspond to a valid fallthrough path). I chose to go with the former (conservative/fdata) approach which produces more accurate profile. We can't rely on pre-filtering such traces early (in LBR sample processing) as DataAggregator is used for both perf samples and pre-aggregated perf information which loses branch stack information. Test Plan: https://github.com/rafaelauler/bolt-tests/pull/22 Reviewed By: #bolt, rafauler Differential Revision: https://reviews.llvm.org/D151614 --- bolt/include/bolt/Profile/DataAggregator.h | 8 ++--- bolt/lib/Profile/DataAggregator.cpp | 37 +++++++++++----------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index 8b53666833125..cc237a6e642b8 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -199,10 +199,10 @@ class DataAggregator : public DataReader { /// execution order. /// /// Return true if the trace is valid, false otherwise. - bool recordTrace( - BinaryFunction &BF, const LBREntry &First, const LBREntry &Second, - uint64_t Count = 1, - SmallVector, 16> *Branches = nullptr) const; + bool + recordTrace(BinaryFunction &BF, const LBREntry &First, const LBREntry &Second, + uint64_t Count, + SmallVector, 16> &Branches) const; /// Return a vector of offsets corresponding to a trace in a function /// (see recordTrace() above). 
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 00ee56c31efb6..67bd2132b344c 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -838,11 +838,9 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, } bool DataAggregator::recordTrace( - BinaryFunction &BF, - const LBREntry &FirstLBR, - const LBREntry &SecondLBR, + BinaryFunction &BF, const LBREntry &FirstLBR, const LBREntry &SecondLBR, uint64_t Count, - SmallVector, 16> *Branches) const { + SmallVector, 16> &Branches) const { BinaryContext &BC = BF.getBinaryContext(); if (!BF.isSimple()) @@ -902,24 +900,27 @@ bool DataAggregator::recordTrace( return false; } - // Record fall-through jumps - BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(*NextBB); - BI.Count += Count; - - if (Branches) { - const MCInst *Instr = BB->getLastNonPseudoInstr(); - uint64_t Offset = 0; - if (Instr) - Offset = BC.MIB->getOffsetWithDefault(*Instr, 0); - else - Offset = BB->getOffset(); + const MCInst *Instr = BB->getLastNonPseudoInstr(); + uint64_t Offset = 0; + if (Instr) + Offset = BC.MIB->getOffsetWithDefault(*Instr, 0); + else + Offset = BB->getOffset(); - Branches->emplace_back(Offset, NextBB->getOffset()); - } + Branches.emplace_back(Offset, NextBB->getOffset()); BB = NextBB; } + // Record fall-through jumps + for (const auto &[FromOffset, ToOffset] : Branches) { + BinaryBasicBlock *FromBB = BF.getBasicBlockContainingOffset(FromOffset); + BinaryBasicBlock *ToBB = BF.getBasicBlockAtOffset(ToOffset); + assert(FromBB && ToBB); + BinaryBasicBlock::BinaryBranchInfo &BI = FromBB->getBranchInfo(*ToBB); + BI.Count += Count; + } + return true; } @@ -930,7 +931,7 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, uint64_t Count) const { SmallVector, 16> Res; - if (!recordTrace(BF, FirstLBR, SecondLBR, Count, &Res)) + if (!recordTrace(BF, FirstLBR, SecondLBR, Count, Res)) return std::nullopt; return Res; From 8ac084728daf5b666624621562afb6d63cc01ae3 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 8 May 2023 10:55:52 -0700 Subject: [PATCH 184/704] [NFC][ASAN] Remove redundant fields of AsanThread --- compiler-rt/lib/asan/asan_interceptors.cpp | 7 +++---- compiler-rt/lib/asan/asan_thread.cpp | 18 ++++++++---------- compiler-rt/lib/asan/asan_thread.h | 9 ++------- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index 7aedefe81f95f..086b23ca1a647 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -177,9 +177,9 @@ static thread_return_t THREAD_CALLING_CONV asan_thread_start(void *arg) { SetCurrentThread(t); auto self = GetThreadSelf(); auto args = asanThreadArgRetval().GetArgs(self); - thread_return_t retval = t->ThreadStart(GetTid()); + thread_return_t retval = + t->ThreadStart(GetTid(), args.routine, args.arg_retval); asanThreadArgRetval().Finish(self, retval); - CHECK_EQ(args.arg_retval, t->get_arg()); return retval; } @@ -197,8 +197,7 @@ INTERCEPTOR(int, pthread_create, void *thread, void *attr, }(); u32 current_tid = GetCurrentTidOrInvalid(); - AsanThread *t = - AsanThread::Create(start_routine, arg, current_tid, &stack, detached); + AsanThread *t = AsanThread::Create(current_tid, &stack, detached); int result; { diff --git a/compiler-rt/lib/asan/asan_thread.cpp b/compiler-rt/lib/asan/asan_thread.cpp index f718adf5e1f73..5d0c134808b87 100644 --- 
a/compiler-rt/lib/asan/asan_thread.cpp +++ b/compiler-rt/lib/asan/asan_thread.cpp @@ -91,14 +91,11 @@ AsanThreadContext *GetThreadContextByTidLocked(u32 tid) { // AsanThread implementation. -AsanThread *AsanThread::Create(thread_callback_t start_routine, void *arg, - u32 parent_tid, StackTrace *stack, +AsanThread *AsanThread::Create(u32 parent_tid, StackTrace *stack, bool detached) { uptr PageSize = GetPageSizeCached(); uptr size = RoundUpTo(sizeof(AsanThread), PageSize); AsanThread *thread = (AsanThread *)MmapOrDie(size, __func__); - thread->start_routine_ = start_routine; - thread->arg_ = arg; AsanThreadContext::CreateThreadContextArgs args = {thread, stack}; asanThreadRegistry().CreateThread(0, detached, parent_tid, &args); @@ -273,22 +270,23 @@ void AsanThread::Init(const InitOptions *options) { // asan_fuchsia.c definies CreateMainThread and SetThreadStackAndTls. #if !SANITIZER_FUCHSIA -thread_return_t AsanThread::ThreadStart(tid_t os_id) { +thread_return_t AsanThread::ThreadStart(tid_t os_id, void *(*routine)(void *), + void *arg) { Init(); asanThreadRegistry().StartThread(tid(), os_id, ThreadType::Regular, nullptr); if (common_flags()->use_sigaltstack) SetAlternateSignalStack(); - if (!start_routine_) { + if (!routine) { // start_routine_ == 0 if we're on the main thread or on one of the // OS X libdispatch worker threads. But nobody is supposed to call // ThreadStart() for the worker threads. - CHECK_EQ(tid(), 0); + CHECK_EQ(tid(), kMainTid); return 0; } - thread_return_t res = start_routine_(arg_); + thread_return_t res = (*routine)(arg); // On POSIX systems we defer this to the TSD destructor. LSan will consider // the thread's memory as non-live from the moment we call Destroy(), even @@ -303,10 +301,10 @@ thread_return_t AsanThread::ThreadStart(tid_t os_id) { AsanThread *CreateMainThread() { AsanThread *main_thread = AsanThread::Create( - /* start_routine */ nullptr, /* arg */ nullptr, /* parent_tid */ kMainTid, + /* parent_tid */ kMainTid, /* stack */ nullptr, /* detached */ true); SetCurrentThread(main_thread); - main_thread->ThreadStart(internal_getpid()); + main_thread->ThreadStart(internal_getpid(), nullptr, nullptr); return main_thread; } diff --git a/compiler-rt/lib/asan/asan_thread.h b/compiler-rt/lib/asan/asan_thread.h index c131dd40d8647..b6b7705f78760 100644 --- a/compiler-rt/lib/asan/asan_thread.h +++ b/compiler-rt/lib/asan/asan_thread.h @@ -59,15 +59,14 @@ COMPILER_CHECK(sizeof(AsanThreadContext) <= 256); // AsanThread are stored in TSD and destroyed when the thread dies. class AsanThread { public: - static AsanThread *Create(thread_callback_t start_routine, void *arg, - u32 parent_tid, StackTrace *stack, bool detached); + static AsanThread *Create(u32 parent_tid, StackTrace *stack, bool detached); static void TSDDtor(void *tsd); void Destroy(); struct InitOptions; void Init(const InitOptions *options = nullptr); - thread_return_t ThreadStart(tid_t os_id); + thread_return_t ThreadStart(tid_t os_id, void *(*routine)(void *), void *arg); uptr stack_top(); uptr stack_bottom(); @@ -130,8 +129,6 @@ class AsanThread { void *extra_spill_area() { return &extra_spill_area_; } - void *get_arg() const { return arg_; } - private: // NOTE: There is no AsanThread constructor. It is allocated // via mmap() and *must* be valid in zero-initialized state. 
@@ -148,8 +145,6 @@ class AsanThread { StackBounds GetStackBounds() const; AsanThreadContext *context_; - thread_callback_t start_routine_; - void *arg_; uptr stack_top_; uptr stack_bottom_; From d146fc8fba34c8e41b8e0e161ea99017eafebfad Mon Sep 17 00:00:00 2001 From: yijia1212 Date: Wed, 31 May 2023 01:37:14 +0000 Subject: [PATCH 185/704] add missing dependency for TosaToLinalg --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 9fe4bf4d27d42..f6009cf927d45 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -9790,11 +9790,13 @@ cc_library( ], deps = [ ":ArithDialect", + ":ArithUtils", ":ConversionPassIncGen", ":DialectUtils", ":FuncDialect", ":IR", ":LinalgDialect", + ":LinalgUtils", ":MathDialect", ":Pass", ":SCFDialect", From 0c05128aeaf8611f835ee59f96467100e1bec6dc Mon Sep 17 00:00:00 2001 From: "Manna, Soumi" Date: Tue, 30 May 2023 18:46:40 -0700 Subject: [PATCH 186/704] [NFC][CLANG] Fix nullptr dereference issue in Type::getRVVEltType() This patch uses castAs instead of getAs which will assert if the type doesn't match in clang::Type::getRVVEltType(clang::ASTContext const &) Reviewed By: erichkeane Differential Revision: https://reviews.llvm.org/D151721 --- clang/lib/AST/Type.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index bde88653417d9..63a5159ba7035 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2463,7 +2463,7 @@ bool Type::isRVVVLSBuiltinType() const { QualType Type::getRVVEltType(const ASTContext &Ctx) const { assert(isRVVVLSBuiltinType() && "unsupported type!"); - const BuiltinType *BTy = getAs(); + const BuiltinType *BTy = castAs(); return Ctx.getBuiltinVectorTypeInfo(BTy).ElementType; } From 62307f6d4e449e66a662e4ed79547e616a6f41ca Mon Sep 17 00:00:00 2001 From: Jianjian GUAN Date: Tue, 30 May 2023 12:31:36 +0800 Subject: [PATCH 187/704] [RISCV] Fix selection for some unmasked vp SDNode. Make unmasked vp_ftrunc, vp_fceil, vp_floor and vp_fround select to unmasked instruction. 
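For context, "unmasked" here means the VP node's mask operand is all-true, so selection is free to use the plain instruction form rather than the ", v0.t" masked variant. A small toy C++ sketch of that idea (illustrative names and assembly strings only, not the backend code):

```
// Toy sketch: an all-true mask means the VP op is effectively unmasked.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct VPOp {
  std::string name;       // e.g. "vp_fceil"
  std::vector<bool> mask; // one enable bit per lane
  unsigned evl;           // explicit vector length
};

// Every lane enabled => the op carries no real masking.
bool isUnmasked(const VPOp &op) {
  return std::all_of(op.mask.begin(), op.mask.end(),
                     [](bool lane) { return lane; });
}

// Stand-in for instruction selection: only masked ops keep the v0.t form.
std::string select(const VPOp &op) {
  return isUnmasked(op) ? "vfabs.v v9, v8"
                        : "vfabs.v v9, v8, v0.t";
}

int main() {
  VPOp ceil{"vp_fceil", {true, true, true, true}, 4};
  std::cout << ceil.name << " (evl=" << ceil.evl << ") -> " << select(ceil)
            << "\n"; // picks the unmasked form
}
```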
Reviewed By: frasercrmck Differential Revision: https://reviews.llvm.org/D151676 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 + .../RISCV/rvv/fixed-vectors-ceil-vp.ll | 192 +++++------------- .../RISCV/rvv/fixed-vectors-floor-vp.ll | 192 +++++------------- .../RISCV/rvv/fixed-vectors-nearbyint-vp.ll | 179 +++++----------- .../RISCV/rvv/fixed-vectors-rint-vp.ll | 192 +++++------------- .../RISCV/rvv/fixed-vectors-round-vp.ll | 192 +++++------------- .../RISCV/rvv/fixed-vectors-roundeven-vp.ll | 192 +++++------------- .../RISCV/rvv/fixed-vectors-roundtozero-vp.ll | 192 +++++------------- 8 files changed, 360 insertions(+), 974 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 229345159280d..c327bc51d771f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2477,6 +2477,9 @@ lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG, SDValue Mask, VL; if (Op->isVPOpcode()) { Mask = Op.getOperand(1); + if (VT.isFixedLengthVector()) + Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG, + Subtarget); VL = Op.getOperand(2); } else { std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll index 3baed1601a29b..d11f49909a8b0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll @@ -32,14 +32,10 @@ define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -80,14 +76,10 @@ define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -128,14 +120,10 @@ define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: 
vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -178,21 +166,15 @@ define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI7_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -226,16 +208,12 @@ define <2 x float> @vp_ceil_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %evl define <2 x float> @vp_ceil_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,16 +252,12 @@ define <4 x float> @vp_ceil_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl define <4 x float> @vp_ceil_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -324,23 +298,17 @@ define <8 x float> @vp_ceil_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl define <8 x float> @vp_ceil_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; 
CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -376,23 +344,17 @@ define <16 x float> @vp_ceil_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext define <16 x float> @vp_ceil_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -428,14 +390,10 @@ define <2 x double> @vp_ceil_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI17_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -478,21 +436,15 @@ define <4 x double> @vp_ceil_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI19_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x 
i32> zeroinitializer @@ -530,21 +482,15 @@ define <8 x double> @vp_ceil_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI21_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -582,21 +528,15 @@ define <15 x double> @vp_ceil_v15f64_unmasked(<15 x double> %va, i32 zeroext %ev ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI23_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -634,21 +574,15 @@ define <16 x double> @vp_ceil_v16f64_unmasked(<16 x double> %va, i32 zeroext %ev ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI25_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -734,60 +668,36 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex define <32 x 
double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_ceil_v32f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vmset.m v1 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB27_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI27_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v2, v1 -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a1, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll index 6657184f54e4d..4791aadc1088b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll @@ -32,14 +32,10 @@ define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -80,14 +76,10 @@ define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; 
CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -128,14 +120,10 @@ define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -178,21 +166,15 @@ define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI7_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -226,16 +208,12 @@ define <2 x float> @vp_floor_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %ev define <2 x float> @vp_floor_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,16 +252,12 @@ define <4 x float> @vp_floor_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %ev define <4 x float> @vp_floor_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: 
vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -324,23 +298,17 @@ define <8 x float> @vp_floor_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %ev define <8 x float> @vp_floor_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -376,23 +344,17 @@ define <16 x float> @vp_floor_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext define <16 x float> @vp_floor_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -428,14 +390,10 @@ define <2 x double> @vp_floor_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI17_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; 
CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -478,21 +436,15 @@ define <4 x double> @vp_floor_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI19_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -530,21 +482,15 @@ define <8 x double> @vp_floor_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI21_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -582,21 +528,15 @@ define <15 x double> @vp_floor_v15f64_unmasked(<15 x double> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI23_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -634,21 +574,15 @@ define <16 x 
double> @vp_floor_v16f64_unmasked(<16 x double> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI25_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -734,60 +668,36 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_floor_v32f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vmset.m v1 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB27_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI27_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v2, v1 -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a1, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; 
CHECK-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index 643be941f9be8..14c1cd072f3a1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -32,14 +32,10 @@ define <2 x half> @vp_nearbyint_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: fsflags a0 @@ -80,14 +76,10 @@ define <4 x half> @vp_nearbyint_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: fsflags a0 @@ -128,14 +120,10 @@ define <8 x half> @vp_nearbyint_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: fsflags a0 @@ -178,21 +166,15 @@ define <16 x half> @vp_nearbyint_v16f16_unmasked(<16 x half> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI7_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = 
insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -226,16 +208,12 @@ define <2 x float> @vp_nearbyint_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext define <2 x float> @vp_nearbyint_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: fsflags a0 @@ -274,16 +252,12 @@ define <4 x float> @vp_nearbyint_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext define <4 x float> @vp_nearbyint_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: fsflags a0 @@ -324,23 +298,17 @@ define <8 x float> @vp_nearbyint_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext define <8 x float> @vp_nearbyint_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -376,23 +344,17 @@ define <16 x float> @vp_nearbyint_v16f32(<16 x float> %va, <16 x i1> %m, i32 zer define <16 x float> @vp_nearbyint_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli 
zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -428,14 +390,10 @@ define <2 x double> @vp_nearbyint_v2f64_unmasked(<2 x double> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI17_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: fsflags a0 @@ -478,21 +436,15 @@ define <4 x double> @vp_nearbyint_v4f64_unmasked(<4 x double> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI19_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -530,21 +482,15 @@ define <8 x double> @vp_nearbyint_v8f64_unmasked(<8 x double> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI21_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = 
insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -582,21 +528,15 @@ define <15 x double> @vp_nearbyint_v15f64_unmasked(<15 x double> %va, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI23_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -634,21 +574,15 @@ define <16 x double> @vp_nearbyint_v16f64_unmasked(<16 x double> %va, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI25_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -710,9 +644,7 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v32f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vmset.m v1 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB27_2 ; CHECK-NEXT: # %bb.1: @@ -721,14 +653,9 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex ; CHECK-NEXT: lui a2, %hi(.LCPI27_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v2, v1 -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: frflags a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: fsflags a1 @@ 
-739,13 +666,9 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: fsflags a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll index 074062203532a..6b69a47167af5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll @@ -30,13 +30,9 @@ define <2 x half> @vp_rint_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu @@ -74,13 +70,9 @@ define <4 x half> @vp_rint_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu @@ -118,13 +110,9 @@ define <8 x half> @vp_rint_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu @@ -164,19 +152,13 @@ define <16 x half> @vp_rint_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI7_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; 
CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -208,15 +190,11 @@ define <2 x float> @vp_rint_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %evl define <2 x float> @vp_rint_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu @@ -252,15 +230,11 @@ define <4 x float> @vp_rint_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %evl define <4 x float> @vp_rint_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu @@ -298,21 +272,15 @@ define <8 x float> @vp_rint_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %evl define <8 x float> @vp_rint_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -346,21 +314,15 @@ define <16 x float> @vp_rint_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext define <16 x float> @vp_rint_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: 
vp_rint_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -394,13 +356,9 @@ define <2 x double> @vp_rint_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI17_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu @@ -440,19 +398,13 @@ define <4 x double> @vp_rint_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI19_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -488,19 +440,13 @@ define <8 x double> @vp_rint_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI21_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: 
vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -536,19 +482,13 @@ define <15 x double> @vp_rint_v15f64_unmasked(<15 x double> %va, i32 zeroext %ev ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI23_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -584,19 +524,13 @@ define <16 x double> @vp_rint_v16f64_unmasked(<16 x double> %va, i32 zeroext %ev ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI25_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -667,56 +601,32 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v32f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vmset.m v1 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB27_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI27_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v2, v1 -; 
CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll index fb859506e0616..5d1aa27474351 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll @@ -32,14 +32,10 @@ define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -80,14 +76,10 @@ define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -128,14 +120,10 @@ define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, 
fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -178,21 +166,15 @@ define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI7_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -226,16 +208,12 @@ define <2 x float> @vp_round_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext %ev define <2 x float> @vp_round_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,16 +252,12 @@ define <4 x float> @vp_round_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext %ev define <4 x float> @vp_round_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -324,23 +298,17 @@ define <8 x float> @vp_round_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext %ev define <8 x float> @vp_round_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, 
ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -376,23 +344,17 @@ define <16 x float> @vp_round_v16f32(<16 x float> %va, <16 x i1> %m, i32 zeroext define <16 x float> @vp_round_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -428,14 +390,10 @@ define <2 x double> @vp_round_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI17_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -478,21 +436,15 @@ define <4 x double> @vp_round_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI19_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t 
+; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -530,21 +482,15 @@ define <8 x double> @vp_round_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI21_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -582,21 +528,15 @@ define <15 x double> @vp_round_v15f64_unmasked(<15 x double> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI23_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -634,21 +574,15 @@ define <16 x double> @vp_round_v16f64_unmasked(<16 x double> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI25_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 
x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -734,60 +668,36 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_round_v32f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vmset.m v1 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB27_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI27_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v2, v1 -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a1, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll index 9f7029e8b0390..88778ce0365b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll @@ -32,14 +32,10 @@ define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; 
CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -80,14 +76,10 @@ define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -128,14 +120,10 @@ define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -178,21 +166,15 @@ define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI7_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -226,16 +208,12 @@ define <2 x float> @vp_roundeven_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext define <2 x float> @vp_roundeven_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,16 +252,12 @@ define <4 x float> @vp_roundeven_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext define <4 x float> 
@vp_roundeven_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -324,23 +298,17 @@ define <8 x float> @vp_roundeven_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext define <8 x float> @vp_roundeven_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -376,23 +344,17 @@ define <16 x float> @vp_roundeven_v16f32(<16 x float> %va, <16 x i1> %m, i32 zer define <16 x float> @vp_roundeven_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -428,14 +390,10 @@ define <2 x double> @vp_roundeven_v2f64_unmasked(<2 x double> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI17_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; 
CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -478,21 +436,15 @@ define <4 x double> @vp_roundeven_v4f64_unmasked(<4 x double> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI19_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -530,21 +482,15 @@ define <8 x double> @vp_roundeven_v8f64_unmasked(<8 x double> %va, i32 zeroext % ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI21_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -582,21 +528,15 @@ define <15 x double> @vp_roundeven_v15f64_unmasked(<15 x double> %va, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI23_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; 
CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -634,21 +574,15 @@ define <16 x double> @vp_roundeven_v16f64_unmasked(<16 x double> %va, i32 zeroex ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI25_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -734,60 +668,36 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundeven_v32f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vmset.m v1 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB27_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI27_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v2, v1 -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a1, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v 
v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll index b9ecf8eca9109..33e6bf60d7053 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll @@ -32,14 +32,10 @@ define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %ev ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -80,14 +76,10 @@ define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %ev ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI3_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -128,14 +120,10 @@ define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %ev ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI5_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -178,21 +166,15 @@ define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI7_0) ; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: 
vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -226,16 +208,12 @@ define <2 x float> @vp_roundtozero_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroe define <2 x float> @vp_roundtozero_v2f32_unmasked(<2 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,16 +252,12 @@ define <4 x float> @vp_roundtozero_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroe define <4 x float> @vp_roundtozero_v4f32_unmasked(<4 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -324,23 +298,17 @@ define <8 x float> @vp_roundtozero_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroe define <8 x float> @vp_roundtozero_v8f32_unmasked(<8 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: vfabs.v v10, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -376,23 +344,17 @@ define <16 x float> @vp_roundtozero_v16f32(<16 x float> %va, <16 x i1> %m, i32 z define <16 x float> @vp_roundtozero_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: 
vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t +; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: lui a0, 307200 ; CHECK-NEXT: fmv.w.x fa5, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -428,14 +390,10 @@ define <2 x double> @vp_roundtozero_v2f64_unmasked(<2 x double> %va, i32 zeroext ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI17_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmset.m v0 ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t +; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -478,21 +436,15 @@ define <4 x double> @vp_roundtozero_v4f64_unmasked(<4 x double> %va, i32 zeroext ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI19_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmset.m v10 ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t +; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t +; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <4 x i1> poison, i1 true, i32 0 %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer @@ -530,21 +482,15 @@ define <8 x double> @vp_roundtozero_v8f64_unmasked(<8 x double> %va, i32 zeroext ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI21_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmset.m v12 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t +; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: 
vfcvt.f.x.v v16, v16, v0.t +; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer @@ -582,21 +528,15 @@ define <15 x double> @vp_roundtozero_v15f64_unmasked(<15 x double> %va, i32 zero ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI23_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <15 x i1> poison, i1 true, i32 0 %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer @@ -634,21 +574,15 @@ define <16 x double> @vp_roundtozero_v16f64_unmasked(<16 x double> %va, i32 zero ; CHECK: # %bb.0: ; CHECK-NEXT: lui a1, %hi(.LCPI25_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmset.m v16 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t ; CHECK-NEXT: ret %head = insertelement <16 x i1> poison, i1 true, i32 0 %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer @@ -734,60 +668,36 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_roundtozero_v32f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vmset.m v1 ; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bltu a0, a2, .LBB27_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: lui a2, %hi(.LCPI27_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; 
CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v2, v1 -; CHECK-NEXT: vmflt.vf v2, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v8 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a1, 1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: fsrm a1 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t ; CHECK-NEXT: addi a1, a0, -16 ; CHECK-NEXT: sltu a0, a0, a1 ; CHECK-NEXT: addi a0, a0, -1 ; CHECK-NEXT: and a0, a0, a1 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v1, v24, fa5, v0.t +; CHECK-NEXT: vfabs.v v24, v16 +; CHECK-NEXT: vmflt.vf v0, v24, fa5 ; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: fsrm a0 ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret %head = insertelement <32 x i1> poison, i1 true, i32 0 %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer From cf236a037fd846d12131809ed07766fceec65fdc Mon Sep 17 00:00:00 2001 From: "Manna, Soumi" Date: Tue, 30 May 2023 18:58:44 -0700 Subject: [PATCH 188/704] [NFC][CLANG] Fix nullptr dereference issue in DeduceTemplateArgumentsByTypeMatch() DeduceTemplateArgumentsByTypeMatch() returns null value which is dereferenced without checking since getAsIncompleteArrayType() returns nullptr and we are dereferencing null pointer value for S.Context->getAsIncompleteArrayType(P) when calling getElementType(). This patch adds an assert. Reviewed By: erichkeane Differential Revision: https://reviews.llvm.org/D151529 --- clang/lib/Sema/SemaTemplateDeduction.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index db72b8b3089e6..27a8a5990b28d 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -1703,10 +1703,12 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch( if (!IAA) return Sema::TDK_NonDeducedMismatch; + const auto *IAP = S.Context.getAsIncompleteArrayType(P); + assert(IAP && "Template parameter not of incomplete array type"); + return DeduceTemplateArgumentsByTypeMatch( - S, TemplateParams, - S.Context.getAsIncompleteArrayType(P)->getElementType(), - IAA->getElementType(), Info, Deduced, TDF & TDF_IgnoreQualifiers); + S, TemplateParams, IAP->getElementType(), IAA->getElementType(), Info, + Deduced, TDF & TDF_IgnoreQualifiers); } // T [integer-constant] From a04f1d2740222598f7cef0e6a0e27c4200a00f94 Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Tue, 30 May 2023 19:15:41 -0700 Subject: [PATCH 189/704] [MachO] Add the relocation types for auth pointers (chained fixups). 
--- llvm/include/llvm/BinaryFormat/MachO.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/include/llvm/BinaryFormat/MachO.h b/llvm/include/llvm/BinaryFormat/MachO.h index d51af31fb14ff..474480f9f0c18 100644 --- a/llvm/include/llvm/BinaryFormat/MachO.h +++ b/llvm/include/llvm/BinaryFormat/MachO.h @@ -473,6 +473,8 @@ enum RelocationInfoType { ARM64_RELOC_TLVP_LOAD_PAGEOFF12 = 9, // Must be followed by ARM64_RELOC_PAGE21 or ARM64_RELOC_PAGEOFF12. ARM64_RELOC_ADDEND = 10, + // An authenticated pointer. + ARM64_RELOC_AUTHENTICATED_POINTER = 11, // Constant values for the r_type field in an x86_64 architecture // llvm::MachO::relocation_info or llvm::MachO::scattered_relocation_info From afb73f7a913ec8e7e8704afe18784571f320ebf6 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 30 May 2023 19:16:29 -0700 Subject: [PATCH 190/704] Revert "[NFC][ASAN] Remove redundant fields of AsanThread" Breaks Windows. This reverts commit 8ac084728daf5b666624621562afb6d63cc01ae3. --- compiler-rt/lib/asan/asan_interceptors.cpp | 7 ++++--- compiler-rt/lib/asan/asan_thread.cpp | 18 ++++++++++-------- compiler-rt/lib/asan/asan_thread.h | 9 +++++++-- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index 086b23ca1a647..7aedefe81f95f 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -177,9 +177,9 @@ static thread_return_t THREAD_CALLING_CONV asan_thread_start(void *arg) { SetCurrentThread(t); auto self = GetThreadSelf(); auto args = asanThreadArgRetval().GetArgs(self); - thread_return_t retval = - t->ThreadStart(GetTid(), args.routine, args.arg_retval); + thread_return_t retval = t->ThreadStart(GetTid()); asanThreadArgRetval().Finish(self, retval); + CHECK_EQ(args.arg_retval, t->get_arg()); return retval; } @@ -197,7 +197,8 @@ INTERCEPTOR(int, pthread_create, void *thread, void *attr, }(); u32 current_tid = GetCurrentTidOrInvalid(); - AsanThread *t = AsanThread::Create(current_tid, &stack, detached); + AsanThread *t = + AsanThread::Create(start_routine, arg, current_tid, &stack, detached); int result; { diff --git a/compiler-rt/lib/asan/asan_thread.cpp b/compiler-rt/lib/asan/asan_thread.cpp index 5d0c134808b87..f718adf5e1f73 100644 --- a/compiler-rt/lib/asan/asan_thread.cpp +++ b/compiler-rt/lib/asan/asan_thread.cpp @@ -91,11 +91,14 @@ AsanThreadContext *GetThreadContextByTidLocked(u32 tid) { // AsanThread implementation. -AsanThread *AsanThread::Create(u32 parent_tid, StackTrace *stack, +AsanThread *AsanThread::Create(thread_callback_t start_routine, void *arg, + u32 parent_tid, StackTrace *stack, bool detached) { uptr PageSize = GetPageSizeCached(); uptr size = RoundUpTo(sizeof(AsanThread), PageSize); AsanThread *thread = (AsanThread *)MmapOrDie(size, __func__); + thread->start_routine_ = start_routine; + thread->arg_ = arg; AsanThreadContext::CreateThreadContextArgs args = {thread, stack}; asanThreadRegistry().CreateThread(0, detached, parent_tid, &args); @@ -270,23 +273,22 @@ void AsanThread::Init(const InitOptions *options) { // asan_fuchsia.c definies CreateMainThread and SetThreadStackAndTls. 
#if !SANITIZER_FUCHSIA -thread_return_t AsanThread::ThreadStart(tid_t os_id, void *(*routine)(void *), - void *arg) { +thread_return_t AsanThread::ThreadStart(tid_t os_id) { Init(); asanThreadRegistry().StartThread(tid(), os_id, ThreadType::Regular, nullptr); if (common_flags()->use_sigaltstack) SetAlternateSignalStack(); - if (!routine) { + if (!start_routine_) { // start_routine_ == 0 if we're on the main thread or on one of the // OS X libdispatch worker threads. But nobody is supposed to call // ThreadStart() for the worker threads. - CHECK_EQ(tid(), kMainTid); + CHECK_EQ(tid(), 0); return 0; } - thread_return_t res = (*routine)(arg); + thread_return_t res = start_routine_(arg_); // On POSIX systems we defer this to the TSD destructor. LSan will consider // the thread's memory as non-live from the moment we call Destroy(), even @@ -301,10 +303,10 @@ thread_return_t AsanThread::ThreadStart(tid_t os_id, void *(*routine)(void *), AsanThread *CreateMainThread() { AsanThread *main_thread = AsanThread::Create( - /* parent_tid */ kMainTid, + /* start_routine */ nullptr, /* arg */ nullptr, /* parent_tid */ kMainTid, /* stack */ nullptr, /* detached */ true); SetCurrentThread(main_thread); - main_thread->ThreadStart(internal_getpid(), nullptr, nullptr); + main_thread->ThreadStart(internal_getpid()); return main_thread; } diff --git a/compiler-rt/lib/asan/asan_thread.h b/compiler-rt/lib/asan/asan_thread.h index b6b7705f78760..c131dd40d8647 100644 --- a/compiler-rt/lib/asan/asan_thread.h +++ b/compiler-rt/lib/asan/asan_thread.h @@ -59,14 +59,15 @@ COMPILER_CHECK(sizeof(AsanThreadContext) <= 256); // AsanThread are stored in TSD and destroyed when the thread dies. class AsanThread { public: - static AsanThread *Create(u32 parent_tid, StackTrace *stack, bool detached); + static AsanThread *Create(thread_callback_t start_routine, void *arg, + u32 parent_tid, StackTrace *stack, bool detached); static void TSDDtor(void *tsd); void Destroy(); struct InitOptions; void Init(const InitOptions *options = nullptr); - thread_return_t ThreadStart(tid_t os_id, void *(*routine)(void *), void *arg); + thread_return_t ThreadStart(tid_t os_id); uptr stack_top(); uptr stack_bottom(); @@ -129,6 +130,8 @@ class AsanThread { void *extra_spill_area() { return &extra_spill_area_; } + void *get_arg() const { return arg_; } + private: // NOTE: There is no AsanThread constructor. It is allocated // via mmap() and *must* be valid in zero-initialized state. 
@@ -145,6 +148,8 @@ class AsanThread { StackBounds GetStackBounds() const; AsanThreadContext *context_; + thread_callback_t start_routine_; + void *arg_; uptr stack_top_; uptr stack_bottom_; From 319d5d99cadfee2a4cd4985db98399b99fb213fd Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 30 May 2023 22:16:29 -0400 Subject: [PATCH 191/704] [NFC][OpenMP] Remove unused variable `new_iv_saved` in `openmp/runtime/src/kmp_collapse.cpp` --- openmp/runtime/src/kmp_collapse.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp index cbfc52567b43f..8d0ed0e945c06 100644 --- a/openmp/runtime/src/kmp_collapse.cpp +++ b/openmp/runtime/src/kmp_collapse.cpp @@ -1135,10 +1135,6 @@ bool kmp_calc_original_ivs_for_chunk_end( kmp_iterations_t iterations = (kmp_iterations_t)__kmp_allocate(sizeof(kmp_loop_nest_iv_t) * n); -#if defined(KMP_DEBUG) - auto new_iv_saved = new_iv; -#endif - // First, calc corresponding iteration in every modified loop: for (kmp_index_t ind = n; ind > 0;) { --ind; From 6114579b609a560c504d80337d992223a095a685 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 30 May 2023 19:19:04 -0700 Subject: [PATCH 192/704] [NFC][sanitizer] Extend ArrayRef --- .../lib/sanitizer_common/sanitizer_common.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index afd1a786d5d33..d43e066ca1b9b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -1085,12 +1085,19 @@ class ArrayRef { ArrayRef() {} ArrayRef(T *begin, T *end) : begin_(begin), end_(end) {} - T *begin() { return begin_; } - T *end() { return end_; } + template + ArrayRef(const C &src) : begin_(src.begin()), end_(src.end()) {} + + const T *begin() const { return begin_; } + const T *end() const { return end_; } + + bool empty() const { return begin_ == end_; } + + uptr size() const { return end_ - begin_; } private: - T *begin_ = nullptr; - T *end_ = nullptr; + const T *begin_ = nullptr; + const T *end_ = nullptr; }; } // namespace __sanitizer From 5f2ce1981593d6e877d4879bdb46152cf6a444db Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Tue, 30 May 2023 19:24:24 -0700 Subject: [PATCH 193/704] [MachO] Add comments to explain the semantics of chained fixups structures. --- llvm/include/llvm/BinaryFormat/MachO.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/MachO.h b/llvm/include/llvm/BinaryFormat/MachO.h index 474480f9f0c18..8f47fabf286b5 100644 --- a/llvm/include/llvm/BinaryFormat/MachO.h +++ b/llvm/include/llvm/BinaryFormat/MachO.h @@ -1039,8 +1039,8 @@ enum { // Values for dyld_chained_starts_in_segment::page_start. enum { DYLD_CHAINED_PTR_START_NONE = 0xFFFF, - DYLD_CHAINED_PTR_START_MULTI = 0x8000, - DYLD_CHAINED_PTR_START_LAST = 0x8000, + DYLD_CHAINED_PTR_START_MULTI = 0x8000, // page which has multiple starts + DYLD_CHAINED_PTR_START_LAST = 0x8000, // last chain_start for a given page }; // Values for dyld_chained_starts_in_segment::pointer_format. 
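As a rough illustration of the semantics the new comments describe (based on how dyld's fixup-chains.h documents page_start, not on code in this patch), a reader of dyld_chained_starts_in_segment might classify a page_start value as follows; the PageKind enum and classifyPageStart helper are hypothetical:

  #include "llvm/BinaryFormat/MachO.h"
  #include <cstdint>

  // Hypothetical classifier, not part of this commit.
  enum class PageKind { NoFixups, SingleChain, MultipleChains };

  static PageKind classifyPageStart(uint16_t Start) {
    using namespace llvm::MachO;
    if (Start == DYLD_CHAINED_PTR_START_NONE)
      return PageKind::NoFixups;       // page has no fixups at all
    if (Start & DYLD_CHAINED_PTR_START_MULTI)
      return PageKind::MultipleChains; // low bits index a list of chain starts;
                                       // the last entry is tagged with
                                       // DYLD_CHAINED_PTR_START_LAST
    return PageKind::SingleChain;      // Start is the offset of the page's
                                       // first fixup
  }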
From 7e99d318fddb8b02bfbd7cfcd0e44281052b382e Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Wed, 31 May 2023 10:33:52 +0800 Subject: [PATCH 194/704] [MachineScheduler] Disable default copy ctor and copy assignment operator for SchedBoundary class SchedBoundary manages resources such as dynamically allocated memory, it's generally a good practice to either implement a custom copy constructor or disable the default one. Reviewed By: pengfei Differential Revision: https://reviews.llvm.org/D151686 --- llvm/include/llvm/CodeGen/MachineScheduler.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index 06e1a8292e519..deb4396d1cdc7 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -722,7 +722,8 @@ class SchedBoundary { Available(ID, Name+".A"), Pending(ID << LogMaxQID, Name+".P") { reset(); } - + SchedBoundary &operator=(const SchedBoundary &other) = delete; + SchedBoundary(const SchedBoundary &other) = delete; ~SchedBoundary(); void reset(); From d5001e4f0daa19d41da8715e511acaa125cef0a4 Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Wed, 31 May 2023 10:34:55 +0800 Subject: [PATCH 195/704] [MC] Disable default copy ctor and copy assignment operator for CodeViewContext class CodeViewContext manages resources such as dynamically allocated memory, it's generally a good practice to either implement a custom copy constructor or disable the default one. Reviewed By: skan Differential Revision: https://reviews.llvm.org/D151695 --- llvm/include/llvm/MC/MCCodeView.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/include/llvm/MC/MCCodeView.h b/llvm/include/llvm/MC/MCCodeView.h index 3d15c4009e438..3e997b1be3b8b 100644 --- a/llvm/include/llvm/MC/MCCodeView.h +++ b/llvm/include/llvm/MC/MCCodeView.h @@ -146,6 +146,9 @@ class CodeViewContext { CodeViewContext(); ~CodeViewContext(); + CodeViewContext &operator=(const CodeViewContext &other) = delete; + CodeViewContext(const CodeViewContext &other) = delete; + bool isValidFileNumber(unsigned FileNumber) const; bool addFile(MCStreamer &OS, unsigned FileNumber, StringRef Filename, ArrayRef ChecksumBytes, uint8_t ChecksumKind); From 0442d08fdb173d89b0779d32eb929957a344f5e6 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Tue, 9 May 2023 14:05:55 -0400 Subject: [PATCH 196/704] [clang][Sema] Improve diagnostics for auto return type Currently when clang fails to deduce auto return type of a function, it does not emit any notes about why it fails. This causes difficulty for users to fix such errors. Actually, clang already generates the information for emitting notes about the failed deduction. There is a TODO for actually emitting them. This patch tries to implement the TODO. Basically it passes the failed template specialization candidate set from the point of specialization failure back to the point where the deduction starts. It is not comprehensive but would be a start for further improvement. 
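For illustration, a minimal case where the new note fires (adapted from the test added in this patch; all names come from that test, nothing here is new API):

  template <typename T>
  void caller(T x) { x.fun(); } // candidate ignored: substitution failure when
                                // T is an abstract class (parameter is by value)

  template <typename T>
  auto getCaller() {
    // error: cannot deduce return type 'auto' from returned value of type
    // '<overloaded function type>'; clang now also emits a note explaining why
    // the candidate template above was ignored.
    return caller<T>;
  }

  class Abstract {
  public:
    void fun();
    virtual void vfun() = 0;
    void call() { getCaller<Abstract>()(*this); }
  };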
Reviewed by: Richard Smith, Matheus Izvekov Differential Revision: https://reviews.llvm.org/D150212 Fixes: SWDEV-354278 --- clang/include/clang/Sema/Sema.h | 18 +++++----- clang/lib/Sema/SemaOverload.cpp | 18 +++++----- clang/lib/Sema/SemaStmt.cpp | 16 +++++++-- clang/lib/Sema/SemaTemplateDeduction.cpp | 41 +++++++++++++---------- clang/test/SemaCXX/auto-type-from-cxx.cpp | 18 ++++++++++ 5 files changed, 72 insertions(+), 39 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 012a3aa93fcdc..be93f8a116c11 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -4153,10 +4153,9 @@ class Sema final { bool resolveAndFixAddressOfSingleOverloadCandidate( ExprResult &SrcExpr, bool DoFunctionPointerConversion = false); - FunctionDecl * - ResolveSingleFunctionTemplateSpecialization(OverloadExpr *ovl, - bool Complain = false, - DeclAccessPair *Found = nullptr); + FunctionDecl *ResolveSingleFunctionTemplateSpecialization( + OverloadExpr *ovl, bool Complain = false, DeclAccessPair *Found = nullptr, + TemplateSpecCandidateSet *FailedTSC = nullptr); bool ResolveAndFixSingleFunctionTemplateSpecialization( ExprResult &SrcExpr, bool DoFunctionPointerConversion = false, @@ -9140,11 +9139,12 @@ class Sema final { TypeSourceInfo *ReplaceAutoTypeSourceInfo(TypeSourceInfo *TypeWithAuto, QualType Replacement); - TemplateDeductionResult DeduceAutoType(TypeLoc AutoTypeLoc, Expr *Initializer, - QualType &Result, - sema::TemplateDeductionInfo &Info, - bool DependentDeduction = false, - bool IgnoreConstraints = false); + TemplateDeductionResult + DeduceAutoType(TypeLoc AutoTypeLoc, Expr *Initializer, QualType &Result, + sema::TemplateDeductionInfo &Info, + bool DependentDeduction = false, + bool IgnoreConstraints = false, + TemplateSpecCandidateSet *FailedTSC = nullptr); void DiagnoseAutoDeductionFailure(VarDecl *VDecl, Expr *Init); bool DeduceReturnType(FunctionDecl *FD, SourceLocation Loc, bool Diagnose = true); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 5308934ed1e3b..71359f13d3a4e 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -12797,10 +12797,9 @@ bool Sema::resolveAndFixAddressOfSingleOverloadCandidate( /// /// If no template-ids are found, no diagnostics are emitted and NULL is /// returned. -FunctionDecl * -Sema::ResolveSingleFunctionTemplateSpecialization(OverloadExpr *ovl, - bool Complain, - DeclAccessPair *FoundResult) { +FunctionDecl *Sema::ResolveSingleFunctionTemplateSpecialization( + OverloadExpr *ovl, bool Complain, DeclAccessPair *FoundResult, + TemplateSpecCandidateSet *FailedTSC) { // C++ [over.over]p1: // [...] [Note: any redundant set of parentheses surrounding the // overloaded function name is ignored (5.1). ] @@ -12814,7 +12813,6 @@ Sema::ResolveSingleFunctionTemplateSpecialization(OverloadExpr *ovl, TemplateArgumentListInfo ExplicitTemplateArgs; ovl->copyTemplateArgumentsInto(ExplicitTemplateArgs); - TemplateSpecCandidateSet FailedCandidates(ovl->getNameLoc()); // Look through all of the overloaded functions, searching for one // whose type matches exactly. @@ -12837,16 +12835,16 @@ Sema::ResolveSingleFunctionTemplateSpecialization(OverloadExpr *ovl, // function template specialization, which is added to the set of // overloaded functions considered. 
FunctionDecl *Specialization = nullptr; - TemplateDeductionInfo Info(FailedCandidates.getLocation()); + TemplateDeductionInfo Info(ovl->getNameLoc()); if (TemplateDeductionResult Result = DeduceTemplateArguments(FunctionTemplate, &ExplicitTemplateArgs, Specialization, Info, /*IsAddressOfFunction*/true)) { // Make a note of the failed deduction for diagnostics. - // TODO: Actually use the failed-deduction info? - FailedCandidates.addCandidate() - .set(I.getPair(), FunctionTemplate->getTemplatedDecl(), - MakeDeductionFailureInfo(Context, Result, Info)); + if (FailedTSC) + FailedTSC->addCandidate().set( + I.getPair(), FunctionTemplate->getTemplatedDecl(), + MakeDeductionFailureInfo(Context, Result, Info)); continue; } diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 7daebbd914024..2c9a17a1fab68 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3825,9 +3825,18 @@ bool Sema::DeduceFunctionTypeFromReturnExpr(FunctionDecl *FD, { // Otherwise, [...] deduce a value for U using the rules of template // argument deduction. - TemplateDeductionInfo Info(RetExpr->getExprLoc()); - TemplateDeductionResult Res = - DeduceAutoType(OrigResultType, RetExpr, Deduced, Info); + auto RetExprLoc = RetExpr->getExprLoc(); + TemplateDeductionInfo Info(RetExprLoc); + SourceLocation TemplateSpecLoc; + if (RetExpr->getType() == Context.OverloadTy) { + auto FindResult = OverloadExpr::find(RetExpr); + if (FindResult.Expression) + TemplateSpecLoc = FindResult.Expression->getNameLoc(); + } + TemplateSpecCandidateSet FailedTSC(TemplateSpecLoc); + TemplateDeductionResult Res = DeduceAutoType( + OrigResultType, RetExpr, Deduced, Info, /*DependentDeduction=*/false, + /*IgnoreConstraints=*/false, &FailedTSC); if (Res != TDK_Success && FD->isInvalidDecl()) return true; switch (Res) { @@ -3853,6 +3862,7 @@ bool Sema::DeduceFunctionTypeFromReturnExpr(FunctionDecl *FD, default: Diag(RetExpr->getExprLoc(), diag::err_auto_fn_deduction_failure) << OrigResultType.getType() << RetExpr->getType(); + FailedTSC.NoteCandidates(*this, RetExprLoc); return true; } } diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 27a8a5990b28d..b3dc61a74364a 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3751,7 +3751,8 @@ static QualType GetTypeOfFunction(Sema &S, const OverloadExpr::FindResult &R, static QualType ResolveOverloadForDeduction(Sema &S, TemplateParameterList *TemplateParams, Expr *Arg, QualType ParamType, - bool ParamWasReference) { + bool ParamWasReference, + TemplateSpecCandidateSet *FailedTSC = nullptr) { OverloadExpr::FindResult R = OverloadExpr::find(Arg); @@ -3773,8 +3774,10 @@ ResolveOverloadForDeduction(Sema &S, TemplateParameterList *TemplateParams, !ParamType->isMemberFunctionPointerType()) { if (Ovl->hasExplicitTemplateArgs()) { // But we can still look for an explicit specialization. - if (FunctionDecl *ExplicitSpec - = S.ResolveSingleFunctionTemplateSpecialization(Ovl)) + if (FunctionDecl *ExplicitSpec = + S.ResolveSingleFunctionTemplateSpecialization( + Ovl, /*Complain=*/false, + /*FoundDeclAccessPair=*/nullptr, FailedTSC)) return GetTypeOfFunction(S, R, ExplicitSpec); } @@ -3856,7 +3859,8 @@ ResolveOverloadForDeduction(Sema &S, TemplateParameterList *TemplateParams, /// overloaded function set that could not be resolved. 
static bool AdjustFunctionParmAndArgTypesForDeduction( Sema &S, TemplateParameterList *TemplateParams, unsigned FirstInnerIndex, - QualType &ParamType, QualType &ArgType, Expr *Arg, unsigned &TDF) { + QualType &ParamType, QualType &ArgType, Expr *Arg, unsigned &TDF, + TemplateSpecCandidateSet *FailedTSC = nullptr) { // C++0x [temp.deduct.call]p3: // If P is a cv-qualified type, the top level cv-qualifiers of P's type // are ignored for type deduction. @@ -3873,9 +3877,8 @@ static bool AdjustFunctionParmAndArgTypesForDeduction( // but there are sometimes special circumstances. Typically // involving a template-id-expr. if (ArgType == S.Context.OverloadTy) { - ArgType = ResolveOverloadForDeduction(S, TemplateParams, - Arg, ParamType, - ParamRefType != nullptr); + ArgType = ResolveOverloadForDeduction(S, TemplateParams, Arg, ParamType, + ParamRefType != nullptr, FailedTSC); if (ArgType.isNull()) return true; } @@ -3953,7 +3956,8 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument( QualType ParamType, Expr *Arg, TemplateDeductionInfo &Info, SmallVectorImpl &Deduced, SmallVectorImpl &OriginalCallArgs, - bool DecomposedParam, unsigned ArgIdx, unsigned TDF); + bool DecomposedParam, unsigned ArgIdx, unsigned TDF, + TemplateSpecCandidateSet *FailedTSC = nullptr); /// Attempt template argument deduction from an initializer list /// deemed to be an argument in a function call. @@ -4029,14 +4033,16 @@ static Sema::TemplateDeductionResult DeduceTemplateArgumentsFromCallArgument( QualType ParamType, Expr *Arg, TemplateDeductionInfo &Info, SmallVectorImpl &Deduced, SmallVectorImpl &OriginalCallArgs, - bool DecomposedParam, unsigned ArgIdx, unsigned TDF) { + bool DecomposedParam, unsigned ArgIdx, unsigned TDF, + TemplateSpecCandidateSet *FailedTSC) { QualType ArgType = Arg->getType(); QualType OrigParamType = ParamType; // If P is a reference type [...] // If P is a cv-qualified type [...] - if (AdjustFunctionParmAndArgTypesForDeduction( - S, TemplateParams, FirstInnerIndex, ParamType, ArgType, Arg, TDF)) + if (AdjustFunctionParmAndArgTypesForDeduction(S, TemplateParams, + FirstInnerIndex, ParamType, + ArgType, Arg, TDF, FailedTSC)) return Sema::TDK_Success; // If [...] the argument is a non-empty initializer list [...] @@ -4719,11 +4725,11 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, /// should be specified in the 'Info' parameter. /// \param IgnoreConstraints Set if we should not fail if the deduced type does /// not satisfy the type-constraint in the auto type. 
-Sema::TemplateDeductionResult Sema::DeduceAutoType(TypeLoc Type, Expr *Init, - QualType &Result, - TemplateDeductionInfo &Info, - bool DependentDeduction, - bool IgnoreConstraints) { +Sema::TemplateDeductionResult +Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result, + TemplateDeductionInfo &Info, bool DependentDeduction, + bool IgnoreConstraints, + TemplateSpecCandidateSet *FailedTSC) { assert(DependentDeduction || Info.getDeducedDepth() == 0); if (Init->containsErrors()) return TDK_AlreadyDiagnosed; @@ -4837,7 +4843,8 @@ Sema::TemplateDeductionResult Sema::DeduceAutoType(TypeLoc Type, Expr *Init, "substituting template parameter for 'auto' failed"); if (auto TDK = DeduceTemplateArgumentsFromCallArgument( *this, TemplateParamsSt.get(), 0, FuncParam, Init, Info, Deduced, - OriginalCallArgs, /*Decomposed=*/false, /*ArgIdx=*/0, /*TDF=*/0)) + OriginalCallArgs, /*Decomposed=*/false, /*ArgIdx=*/0, /*TDF=*/0, + FailedTSC)) return DeductionFailed(TDK); } diff --git a/clang/test/SemaCXX/auto-type-from-cxx.cpp b/clang/test/SemaCXX/auto-type-from-cxx.cpp index 21620f73be7e9..5cd48991ffb7a 100644 --- a/clang/test/SemaCXX/auto-type-from-cxx.cpp +++ b/clang/test/SemaCXX/auto-type-from-cxx.cpp @@ -18,3 +18,21 @@ int d() { new __auto_type; // expected-error {{'__auto_type' not allowed in type allocated by 'new'}} } +namespace TestDeductionFail { + +template +void caller(T x) {x.fun();} // expected-note {{candidate template ignored: substitution failure [with T = TestDeductionFail::Abstract]: parameter type 'TestDeductionFail::Abstract' is an abstract class}} + +template +auto getCaller(){ + return caller; // expected-error {{cannot deduce return type 'auto' from returned value of type ''}} +} + +class Abstract{ + public: + void fun(); + virtual void vfun()=0; + void call(){getCaller()(*this);} // expected-note {{in instantiation of function template specialization 'TestDeductionFail::getCaller' requested here}} +}; + +} From e6830b6028ec5434ccf8dbebdd992918f67b1751 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Tue, 30 May 2023 21:06:50 -0700 Subject: [PATCH 197/704] [clang][modules] NFCI: Extract optionality out of `Module::{Header,DirectoryName}` Most users of `Module::Header` already assume its `Entry` is populated. Enforce this assumption in the type system and handle the only case where this is not the case by wrapping the whole struct in `std::optional`. Do the same for `Module::DirectoryName`. Depends on D151584. Reviewed By: benlangmuir Differential Revision: https://reviews.llvm.org/D151586 --- .../modularize/CoverageChecker.cpp | 20 +++++++++---------- .../modularize/ModularizeUtilities.cpp | 15 +++++++------- clang/include/clang/Basic/Module.h | 16 ++++++--------- clang/lib/Basic/Module.cpp | 12 +++++------ clang/lib/Frontend/FrontendAction.cpp | 18 +++++++++-------- clang/lib/Lex/ModuleMap.cpp | 4 ++-- clang/lib/Lex/PPLexerChange.cpp | 7 ++++--- clang/lib/Serialization/ASTReader.cpp | 9 +++++---- clang/lib/Serialization/ASTWriter.cpp | 9 +++++---- 9 files changed, 56 insertions(+), 54 deletions(-) diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp index 83c39e495c31a..d8445053872bf 100644 --- a/clang-tools-extra/modularize/CoverageChecker.cpp +++ b/clang-tools-extra/modularize/CoverageChecker.cpp @@ -207,25 +207,25 @@ void CoverageChecker::collectModuleHeaders() { // FIXME: Doesn't collect files from umbrella header. 
bool CoverageChecker::collectModuleHeaders(const Module &Mod) { - if (const FileEntry *UmbrellaHeader = - Mod.getUmbrellaHeaderAsWritten().Entry) { + if (std::optional UmbrellaHeader = + Mod.getUmbrellaHeaderAsWritten()) { // Collect umbrella header. - ModuleMapHeadersSet.insert(ModularizeUtilities::getCanonicalPath( - UmbrellaHeader->getName())); + ModuleMapHeadersSet.insert( + ModularizeUtilities::getCanonicalPath(UmbrellaHeader->Entry.getName())); // Preprocess umbrella header and collect the headers it references. - if (!collectUmbrellaHeaderHeaders(UmbrellaHeader->getName())) + if (!collectUmbrellaHeaderHeaders(UmbrellaHeader->Entry.getName())) return false; - } else if (const DirectoryEntry *UmbrellaDir = - Mod.getUmbrellaDirAsWritten().Entry) { + } else if (std::optional UmbrellaDir = + Mod.getUmbrellaDirAsWritten()) { // Collect headers in umbrella directory. - if (!collectUmbrellaHeaders(UmbrellaDir->getName())) + if (!collectUmbrellaHeaders(UmbrellaDir->Entry.getName())) return false; } for (auto &HeaderKind : Mod.Headers) for (auto &Header : HeaderKind) - ModuleMapHeadersSet.insert(ModularizeUtilities::getCanonicalPath( - Header.Entry->getName())); + ModuleMapHeadersSet.insert( + ModularizeUtilities::getCanonicalPath(Header.Entry.getName())); for (auto *Submodule : Mod.submodules()) collectModuleHeaders(*Submodule); diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp index 5b09c916606d9..3ef808d204c61 100644 --- a/clang-tools-extra/modularize/ModularizeUtilities.cpp +++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp @@ -348,19 +348,20 @@ bool ModularizeUtilities::collectModuleHeaders(const clang::Module &Mod) { for (auto *Submodule : Mod.submodules()) collectModuleHeaders(*Submodule); - if (const FileEntry *UmbrellaHeader = - Mod.getUmbrellaHeaderAsWritten().Entry) { - std::string HeaderPath = getCanonicalPath(UmbrellaHeader->getName()); + if (std::optional UmbrellaHeader = + Mod.getUmbrellaHeaderAsWritten()) { + std::string HeaderPath = getCanonicalPath(UmbrellaHeader->Entry.getName()); // Collect umbrella header. HeaderFileNames.push_back(HeaderPath); // FUTURE: When needed, umbrella header header collection goes here. - } else if (const DirectoryEntry *UmbrellaDir = - Mod.getUmbrellaDirAsWritten().Entry) { + } else if (std::optional UmbrellaDir = + Mod.getUmbrellaDirAsWritten()) { // If there normal headers, assume these are umbrellas and skip collection. if (Mod.Headers->size() == 0) { // Collect headers in umbrella directory. - if (!collectUmbrellaHeaders(UmbrellaDir->getName(), UmbrellaDependents)) + if (!collectUmbrellaHeaders(UmbrellaDir->Entry.getName(), + UmbrellaDependents)) return false; } } @@ -377,7 +378,7 @@ bool ModularizeUtilities::collectModuleHeaders(const clang::Module &Mod) { // Collect normal header. 
const clang::Module::Header &Header( Mod.Headers[clang::Module::HK_Normal][Index]); - std::string HeaderPath = getCanonicalPath(Header.Entry->getName()); + std::string HeaderPath = getCanonicalPath(Header.Entry.getName()); HeaderFileNames.push_back(HeaderPath); } diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h index 3ecab422bc42c..9625a682c3549 100644 --- a/clang/include/clang/Basic/Module.h +++ b/clang/include/clang/Basic/Module.h @@ -243,9 +243,7 @@ class alignas(8) Module { struct Header { std::string NameAsWritten; std::string PathRelativeToRootModuleDirectory; - OptionalFileEntryRefDegradesToFileEntryPtr Entry; - - explicit operator bool() { return Entry.has_value(); } + FileEntryRef Entry; }; /// Information about a directory name as found in the module map @@ -253,9 +251,7 @@ class alignas(8) Module { struct DirectoryName { std::string NameAsWritten; std::string PathRelativeToRootModuleDirectory; - OptionalDirectoryEntryRefDegradesToDirectoryEntryPtr Entry; - - explicit operator bool() { return Entry.has_value(); } + DirectoryEntryRef Entry; }; /// The headers that are part of this module. @@ -653,21 +649,21 @@ class alignas(8) Module { } /// Retrieve the umbrella directory as written. - DirectoryName getUmbrellaDirAsWritten() const { + std::optional getUmbrellaDirAsWritten() const { if (const auto *ME = Umbrella.dyn_cast()) return DirectoryName{UmbrellaAsWritten, UmbrellaRelativeToRootModuleDirectory, DirectoryEntryRef(*ME)}; - return DirectoryName{}; + return std::nullopt; } /// Retrieve the umbrella header as written. - Header getUmbrellaHeaderAsWritten() const { + std::optional
getUmbrellaHeaderAsWritten() const { if (const auto *ME = Umbrella.dyn_cast()) return Header{UmbrellaAsWritten, UmbrellaRelativeToRootModuleDirectory, FileEntryRef(*ME)}; - return Header{}; + return std::nullopt; } /// Get the effective umbrella directory for this module: either the one diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp index 3df376a32e53e..057fc77d0e993 100644 --- a/clang/lib/Basic/Module.cpp +++ b/clang/lib/Basic/Module.cpp @@ -483,15 +483,15 @@ void Module::print(raw_ostream &OS, unsigned Indent, bool Dump) const { OS << "\n"; } - if (Header H = getUmbrellaHeaderAsWritten()) { + if (std::optional
H = getUmbrellaHeaderAsWritten()) { OS.indent(Indent + 2); OS << "umbrella header \""; - OS.write_escaped(H.NameAsWritten); + OS.write_escaped(H->NameAsWritten); OS << "\"\n"; - } else if (DirectoryName D = getUmbrellaDirAsWritten()) { + } else if (std::optional D = getUmbrellaDirAsWritten()) { OS.indent(Indent + 2); OS << "umbrella \""; - OS.write_escaped(D.NameAsWritten); + OS.write_escaped(D->NameAsWritten); OS << "\"\n"; } @@ -523,8 +523,8 @@ void Module::print(raw_ostream &OS, unsigned Indent, bool Dump) const { OS.indent(Indent + 2); OS << K.Prefix << "header \""; OS.write_escaped(H.NameAsWritten); - OS << "\" { size " << H.Entry->getSize() - << " mtime " << H.Entry->getModificationTime() << " }\n"; + OS << "\" { size " << H.Entry.getSize() + << " mtime " << H.Entry.getModificationTime() << " }\n"; } } for (auto *Unresolved : {&UnresolvedHeaders, &MissingHeaders}) { diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index a8dcdb44b08df..c5893874e1d32 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -364,18 +364,19 @@ static std::error_code collectModuleHeaderIncludes( } // Note that Module->PrivateHeaders will not be a TopHeader. - if (Module::Header UmbrellaHeader = Module->getUmbrellaHeaderAsWritten()) { - Module->addTopHeader(UmbrellaHeader.Entry); + if (std::optional UmbrellaHeader = + Module->getUmbrellaHeaderAsWritten()) { + Module->addTopHeader(UmbrellaHeader->Entry); if (Module->Parent) // Include the umbrella header for submodules. - addHeaderInclude(UmbrellaHeader.PathRelativeToRootModuleDirectory, + addHeaderInclude(UmbrellaHeader->PathRelativeToRootModuleDirectory, Includes, LangOpts, Module->IsExternC); - } else if (Module::DirectoryName UmbrellaDir = + } else if (std::optional UmbrellaDir = Module->getUmbrellaDirAsWritten()) { // Add all of the headers we find in this subdirectory. std::error_code EC; SmallString<128> DirNative; - llvm::sys::path::native(UmbrellaDir.Entry->getName(), DirNative); + llvm::sys::path::native(UmbrellaDir->Entry.getName(), DirNative); llvm::vfs::FileSystem &FS = FileMgr.getVirtualFileSystem(); SmallVector< @@ -407,7 +408,7 @@ static std::error_code collectModuleHeaderIncludes( for (int I = 0; I != Dir.level() + 1; ++I, ++PathIt) Components.push_back(*PathIt); SmallString<128> RelativeHeader( - UmbrellaDir.PathRelativeToRootModuleDirectory); + UmbrellaDir->PathRelativeToRootModuleDirectory); for (auto It = Components.rbegin(), End = Components.rend(); It != End; ++It) llvm::sys::path::append(RelativeHeader, *It); @@ -553,8 +554,9 @@ getInputBufferForModule(CompilerInstance &CI, Module *M) { // Collect the set of #includes we need to build the module. SmallString<256> HeaderContents; std::error_code Err = std::error_code(); - if (Module::Header UmbrellaHeader = M->getUmbrellaHeaderAsWritten()) - addHeaderInclude(UmbrellaHeader.PathRelativeToRootModuleDirectory, + if (std::optional UmbrellaHeader = + M->getUmbrellaHeaderAsWritten()) + addHeaderInclude(UmbrellaHeader->PathRelativeToRootModuleDirectory, HeaderContents, CI.getLangOpts(), M->IsExternC); Err = collectModuleHeaderIncludes( CI.getLangOpts(), FileMgr, CI.getDiagnostics(), diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 6808fdfdaf4f9..bfd4890e3a97b 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -1289,7 +1289,7 @@ void ModuleMap::addHeader(Module *Mod, Module::Header Header, // Notify callbacks that we just added a new header. 
for (const auto &Cb : Callbacks) - Cb->moduleMapAddHeader(Header.Entry->getName()); + Cb->moduleMapAddHeader(Header.Entry.getName()); } OptionalFileEntryRef @@ -2541,7 +2541,7 @@ void ModuleMapParser::parseUmbrellaDirDecl(SourceLocation UmbrellaLoc) { for (llvm::vfs::recursive_directory_iterator I(FS, Dir->getName(), EC), E; I != E && !EC; I.increment(EC)) { if (auto FE = SourceMgr.getFileManager().getOptionalFileRef(I->path())) { - Module::Header Header = {"", std::string(I->path()), FE}; + Module::Header Header = {"", std::string(I->path()), *FE}; Headers.push_back(std::move(Header)); } } diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp index 4103cfe178b29..e2dc532e6b708 100644 --- a/clang/lib/Lex/PPLexerChange.cpp +++ b/clang/lib/Lex/PPLexerChange.cpp @@ -289,9 +289,10 @@ static void collectAllSubModulesWithUmbrellaHeader( } void Preprocessor::diagnoseMissingHeaderInUmbrellaDir(const Module &Mod) { - Module::Header UmbrellaHeader = Mod.getUmbrellaHeaderAsWritten(); - assert(UmbrellaHeader.Entry && "Module must use umbrella header"); - const FileID &File = SourceMgr.translateFile(UmbrellaHeader.Entry); + std::optional UmbrellaHeader = + Mod.getUmbrellaHeaderAsWritten(); + assert(UmbrellaHeader && "Module must use umbrella header"); + const FileID &File = SourceMgr.translateFile(UmbrellaHeader->Entry); SourceLocation ExpectedHeadersLoc = SourceMgr.getLocForEndOfFile(File); if (getDiagnostics().isIgnored(diag::warn_uncovered_module_header, ExpectedHeadersLoc)) diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index f6251fb03ccf2..a0ccc5aa4a741 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1973,10 +1973,11 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, std::string Filename = std::string(key.Filename); if (key.Imported) Reader.ResolveImportedPath(M, Filename); - // FIXME: NameAsWritten - Module::Header H = {std::string(key.Filename), "", - FileMgr.getOptionalFileRef(Filename)}; - ModMap.addHeader(Mod, H, HeaderRole, /*Imported*/true); + if (auto FE = FileMgr.getOptionalFileRef(Filename)) { + // FIXME: NameAsWritten + Module::Header H = {std::string(key.Filename), "", *FE}; + ModMap.addHeader(Mod, H, HeaderRole, /*Imported=*/true); + } HFI.isModuleHeader |= ModuleMap::isModular(HeaderRole); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 96b087ed57933..21528f8140eb1 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -2846,15 +2846,16 @@ void ASTWriter::WriteSubmodules(Module *WritingModule) { } // Emit the umbrella header, if there is one. - if (Module::Header UmbrellaHeader = Mod->getUmbrellaHeaderAsWritten()) { + if (std::optional UmbrellaHeader = + Mod->getUmbrellaHeaderAsWritten()) { RecordData::value_type Record[] = {SUBMODULE_UMBRELLA_HEADER}; Stream.EmitRecordWithBlob(UmbrellaAbbrev, Record, - UmbrellaHeader.NameAsWritten); - } else if (Module::DirectoryName UmbrellaDir = + UmbrellaHeader->NameAsWritten); + } else if (std::optional UmbrellaDir = Mod->getUmbrellaDirAsWritten()) { RecordData::value_type Record[] = {SUBMODULE_UMBRELLA_DIR}; Stream.EmitRecordWithBlob(UmbrellaDirAbbrev, Record, - UmbrellaDir.NameAsWritten); + UmbrellaDir->NameAsWritten); } // Emit the headers. 
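A minimal sketch of the call-site shape after this change (the printUmbrella helper is hypothetical and not part of the patch; the Module APIs are the ones modified above): the optionality now lives on the whole struct, so Entry needs no separate null check once the optional is engaged.

  #include "clang/Basic/Module.h"
  #include "llvm/Support/raw_ostream.h"
  #include <optional>

  // Hypothetical helper, not added by this commit.
  static void printUmbrella(const clang::Module &Mod) {
    if (std::optional<clang::Module::Header> H = Mod.getUmbrellaHeaderAsWritten())
      llvm::outs() << H->Entry.getName() << "\n"; // Entry is a FileEntryRef
    else if (std::optional<clang::Module::DirectoryName> D =
                 Mod.getUmbrellaDirAsWritten())
      llvm::outs() << D->Entry.getName() << "\n"; // Entry is a DirectoryEntryRef
  }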
From 0a4ba485c960f76edd6fd8a5fec8bf4350710025 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 30 May 2023 21:27:36 -0700 Subject: [PATCH 198/704] [RISCV] Strengthen some SDTypeProfiles to reduce isel table size. --- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 69 +++++++++++-------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index b83ae5ff7cddb..c14b7ddaa00f6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -373,35 +373,48 @@ def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL", SDTCVecEltisVT<2, i1>, SDTCisVT<3, XLenVT>]>>; -def SDT_RISCVVWBinOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, - SDTCisSameNumEltsAs<0, 1>, - SDTCisSameAs<1, 2>, - SDTCisSameAs<0, 3>, - SDTCisSameNumEltsAs<1, 4>, - SDTCVecEltisVT<4, i1>, - SDTCisVT<5, XLenVT>]>; -def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; -def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; -def riscv_vwmulsu_vl : SDNode<"RISCVISD::VWMULSU_VL", SDT_RISCVVWBinOp_VL>; -def riscv_vwadd_vl : SDNode<"RISCVISD::VWADD_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; -def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; -def riscv_vwsub_vl : SDNode<"RISCVISD::VWSUB_VL", SDT_RISCVVWBinOp_VL, []>; -def riscv_vwsubu_vl : SDNode<"RISCVISD::VWSUBU_VL", SDT_RISCVVWBinOp_VL, []>; - -def riscv_vfwmul_vl : SDNode<"RISCVISD::VFWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>; - -def SDT_RISCVVNBinOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, - SDTCisSameNumEltsAs<0, 1>, - SDTCisOpSmallerThanOp<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisSameNumEltsAs<0, 4>, - SDTCVecEltisVT<4, i1>, - SDTCisVT<5, XLenVT>]>; -def riscv_vnsrl_vl : SDNode<"RISCVISD::VNSRL_VL", SDT_RISCVVNBinOp_VL>; - -def SDT_RISCVVWBinOpW_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, +def SDT_RISCVVWIntBinOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisInt<0>, + SDTCisInt<1>, + SDTCisSameNumEltsAs<0, 1>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisSameAs<1, 2>, + SDTCisSameAs<0, 3>, + SDTCisSameNumEltsAs<1, 4>, + SDTCVecEltisVT<4, i1>, + SDTCisVT<5, XLenVT>]>; +def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWIntBinOp_VL, [SDNPCommutative]>; +def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWIntBinOp_VL, [SDNPCommutative]>; +def riscv_vwmulsu_vl : SDNode<"RISCVISD::VWMULSU_VL", SDT_RISCVVWIntBinOp_VL>; +def riscv_vwadd_vl : SDNode<"RISCVISD::VWADD_VL", SDT_RISCVVWIntBinOp_VL, [SDNPCommutative]>; +def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWIntBinOp_VL, [SDNPCommutative]>; +def riscv_vwsub_vl : SDNode<"RISCVISD::VWSUB_VL", SDT_RISCVVWIntBinOp_VL, []>; +def riscv_vwsubu_vl : SDNode<"RISCVISD::VWSUBU_VL", SDT_RISCVVWIntBinOp_VL, []>; + +def SDT_RISCVVWFPBinOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisFP<1>, + SDTCisSameNumEltsAs<0, 1>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisSameAs<1, 2>, + SDTCisSameAs<0, 3>, + SDTCisSameNumEltsAs<1, 4>, + SDTCVecEltisVT<4, i1>, + SDTCisVT<5, XLenVT>]>; +def riscv_vfwmul_vl : SDNode<"RISCVISD::VFWMUL_VL", SDT_RISCVVWFPBinOp_VL, [SDNPCommutative]>; + +def SDT_RISCVVNIntBinOp_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisInt<0>, + SDTCisInt<1>, + SDTCisSameNumEltsAs<0, 1>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + 
SDTCisSameNumEltsAs<0, 4>, + SDTCVecEltisVT<4, i1>, + SDTCisVT<5, XLenVT>]>; +def riscv_vnsrl_vl : SDNode<"RISCVISD::VNSRL_VL", SDT_RISCVVNIntBinOp_VL>; + +def SDT_RISCVVWBinOpW_VL : SDTypeProfile<1, 5, [SDTCisVec<0>, SDTCisInt<0>, SDTCisSameAs<0, 1>, + SDTCisInt<2>, SDTCisSameNumEltsAs<1, 2>, SDTCisOpSmallerThanOp<2, 1>, SDTCisSameAs<0, 3>, From 9bd3ff8bf9e05e7bf95a4a068aa3b50813a93975 Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Fri, 26 May 2023 12:05:09 -0700 Subject: [PATCH 199/704] Fix -u option in dsymutil, to not emit an extra DW_LNE_set_address if the original line table was empty With dsymutil's -u option, only the accelerator tables should be updated, but with https://reviews.llvm.org/D150554 the -u option will still re-generate the line table. If the line table was empty, that is, it was a dummy line table, with no entries in it, dsymutil will always generate a line table with a DW_LNE_end_sequence, a funky side effect of this is that when the line table is re-generated, it will always emit a DW_LNE_set_address first, which will change the line table total size. This patch addresses this by making sure that if all the line table has in it is a DW_LNE_end_sequence, it is the same as a dummy entry. Differential Revision: https://reviews.llvm.org/D151579 --- llvm/lib/DWARFLinker/DWARFLinker.cpp | 4 + .../tools/dsymutil/ARM/fat-dylib-update.test | 155 ++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 llvm/test/tools/dsymutil/ARM/fat-dylib-update.test diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index af2e0db74066c..363cff430a662 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -2082,6 +2082,10 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { // Set Line Table Rows. if (Linker.Options.Update) { LineTable.Rows = LT->Rows; + // If all the line table contains is a DW_LNE_end_sequence, clear the line + // table rows, it will be inserted again in the DWARFStreamer. + if (LineTable.Rows.size() == 1 && LineTable.Rows[0].EndSequence) + LineTable.Rows.clear(); LineTable.Sequences = LT->Sequences; } else { diff --git a/llvm/test/tools/dsymutil/ARM/fat-dylib-update.test b/llvm/test/tools/dsymutil/ARM/fat-dylib-update.test new file mode 100644 index 0000000000000..fbe8f29953cd7 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/fat-dylib-update.test @@ -0,0 +1,155 @@ +# REQUIRES: object-emission +# RUN: dsymutil -oso-prepend-path %p/.. 
%p/../Inputs/fat-test.arm.dylib -o %t.dSYM +# RUN: llvm-dwarfdump -a -v %t.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s +# RUN: dsymutil -u %t.dSYM +# RUN: llvm-dwarfdump -a -v %t.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s +# RUN: dsymutil -u %t.dSYM -o %t1.dSYM +# RUN: llvm-dwarfdump -a -v %t1.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s + +CHECK: /Contents/Resources/DWARF/fat-test.arm.dylib(armv7): file format Mach-O arm + +CHECK: .debug_info contents: +CHECK: Compile Unit: length = 0x00000034, format = DWARF32, version = 0x0002, abbr_offset = 0x0000, addr_size = 0x04 (next unit at 0x00000038) +CHECK: DW_TAG_compile_unit [1] * +CHECK: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000001] = "clang version 3.8.0 (trunk 243776)") +CHECK: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) +CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000024] = "fat-test.c") +CHECK: DW_AT_stmt_list [DW_FORM_data4] (0x00000000) +CHECK: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x0000002f] = "/Inputs") +CHECK: DW_TAG_variable [2] +CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "armv7_var") +CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x0030 => {0x00000030} +CHECK: DW_AT_external [DW_FORM_flag] (0x01) +CHECK: DW_AT_decl_file [DW_FORM_data1] ("/Inputs/fat-test.c") +CHECK: DW_AT_decl_line [DW_FORM_data1] (23) +CHECK: DW_AT_location [DW_FORM_block1] (DW_OP_addr 0x1000) +CHECK: DW_TAG_base_type [3] +CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000041] = "int") +CHECK: DW_AT_encoding [DW_FORM_data1] (DW_ATE_signed) +CHECK: DW_AT_byte_size [DW_FORM_data1] (0x04) +CHECK: NULL + + +CHECK: .debug_line contents: +CHECK: Line table prologue: +CHECK: total_length: 0x0000002a +CHECK: version: 2 +CHECK: prologue_length: 0x00000021 +CHECK: min_inst_length: 1 +CHECK: default_is_stmt: 1 +CHECK: line_base: -5 +CHECK: line_range: 14 +CHECK: opcode_base: 13 + +CHECK: .apple_names contents: +CHECK: String: 0x00000037 "armv7_var" +CHECK-NEXT: Data 0 [ +CHECK-NEXT: Atom[0]: 0x0000001e +CHECK-NEXT: ] + +CHECK: .apple_types contents: +CHECK: String: 0x00000041 "int" +CHECK-NEXT: Data 0 [ +CHECK-NEXT: Atom[0]: 0x00000030 +CHECK-NEXT: Atom[1]: 0x0024 +CHECK-NEXT: Atom[2]: 0x00 +CHECK-NEXT: Atom[3]: 0x0c3a28a4 +CHECK-NEXT: ] + +CHECK: /Contents/Resources/DWARF/fat-test.arm.dylib(armv7s): file format Mach-O arm + +CHECK: .debug_info contents: +CHECK: Compile Unit: length = 0x00000034, format = DWARF32, version = 0x0002, abbr_offset = 0x0000, addr_size = 0x04 (next unit at 0x00000038) +CHECK: DW_TAG_compile_unit [1] * +CHECK: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000001] = "clang version 3.8.0 (trunk 243776)") +CHECK: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) +CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000024] = "fat-test.c") +CHECK: DW_AT_stmt_list [DW_FORM_data4] (0x00000000) +CHECK: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x0000002f] = "/Inputs") +CHECK: DW_TAG_variable [2] +CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "armv7s_var") +CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x0030 => {0x00000030} +CHECK: DW_AT_external [DW_FORM_flag] (0x01) +CHECK: DW_AT_decl_file [DW_FORM_data1] ("/Inputs/fat-test.c") +CHECK: DW_AT_decl_line [DW_FORM_data1] (21) +CHECK: DW_AT_location [DW_FORM_block1] (DW_OP_addr 0x1000) +CHECK: DW_TAG_base_type [3] +CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000042] = "int") +CHECK: DW_AT_encoding [DW_FORM_data1] (DW_ATE_signed) +CHECK: DW_AT_byte_size [DW_FORM_data1] (0x04) +CHECK: NULL + +CHECK: 
.debug_line contents: +CHECK: Line table prologue: +CHECK: total_length: 0x0000002a +CHECK: version: 2 +CHECK: prologue_length: 0x00000021 +CHECK: min_inst_length: 1 +CHECK: default_is_stmt: 1 +CHECK: line_base: -5 +CHECK: line_range: 14 +CHECK: opcode_base: 13 + +CHECK: .apple_names contents: +CHECK: String: 0x00000037 "armv7s_var" +CHECK-NEXT: Data 0 [ +CHECK-NEXT: Atom[0]: 0x0000001e +CHECK-NEXT: ] + +CHECK: .apple_types contents: +CHECK: String: 0x00000042 "int" +CHECK-NEXT: Data 0 [ +CHECK-NEXT: Atom[0]: 0x00000030 +CHECK-NEXT: Atom[1]: 0x0024 +CHECK-NEXT: Atom[2]: 0x00 +CHECK-NEXT: Atom[3]: 0x0c3a28a4 +CHECK-NEXT: ] + +CHECK: /Contents/Resources/DWARF/fat-test.arm.dylib(arm64): file format Mach-O arm64 + +CHECK: .debug_info contents: +CHECK: Compile Unit: length = 0x00000038, format = DWARF32, version = 0x0002, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x0000003c) +CHECK: DW_TAG_compile_unit [1] * +CHECK: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000001] = "clang version 3.8.0 (trunk 243776)") +CHECK: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) +CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000024] = "fat-test.c") +CHECK: DW_AT_stmt_list [DW_FORM_data4] (0x00000000) +CHECK: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x0000002f] = "/Inputs") +CHECK: DW_TAG_variable [2] +CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "arm64_var") +CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x0034 => {0x00000034} +CHECK: DW_AT_external [DW_FORM_flag] (0x01) +CHECK: DW_AT_decl_file [DW_FORM_data1] ("/Inputs/fat-test.c") +CHECK: DW_AT_decl_line [DW_FORM_data1] (25) +CHECK: DW_AT_location [DW_FORM_block1] (DW_OP_addr 0x4000) +CHECK: DW_TAG_base_type [3] +CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000041] = "int") +CHECK: DW_AT_encoding [DW_FORM_data1] (DW_ATE_signed) +CHECK: DW_AT_byte_size [DW_FORM_data1] (0x04) +CHECK: NULL + +CHECK: .debug_line contents: +CHECK: Line table prologue: +CHECK: total_length: 0x0000002a +CHECK: version: 2 +CHECK: prologue_length: 0x00000021 +CHECK: min_inst_length: 1 +CHECK: default_is_stmt: 1 +CHECK: line_base: -5 +CHECK: line_range: 14 +CHECK: opcode_base: 13 + +CHECK: .apple_names contents: +CHECK: String: 0x00000037 "arm64_var" +CHECK-NEXT: Data 0 [ +CHECK-NEXT: Atom[0]: 0x0000001e +CHECK-NEXT: ] + +CHECK: .apple_types contents: +CHECK: String: 0x00000041 "int" +CHECK-NEXT: Data 0 [ +CHECK-NEXT: Atom[0]: 0x00000034 +CHECK-NEXT: Atom[1]: 0x0024 +CHECK-NEXT: Atom[2]: 0x00 +CHECK-NEXT: Atom[3]: 0x0c3a28a4 +CHECK-NEXT: ] From 3473f728b36e21d322f141e576377c20a02c9aad Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Tue, 30 May 2023 21:24:51 -0700 Subject: [PATCH 200/704] [clang][lex] NFCI: Use DirectoryEntryRef in HeaderSearch::LookupFile This patch changes the argument type to `HeaderSearch::LookupFile()` from `const DirectoryEntry *` to `DirectoryEntryRef` in order to remove some calls to the deprecated `DirectoryEntry::getName()`. Depends on D127660. 
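As a sketch of the new calling convention (assuming the includer element type becomes `std::pair<OptionalFileEntryRef, DirectoryEntryRef>`; the helper name is hypothetical):

  #include "clang/Basic/FileManager.h"
  #include "llvm/ADT/SmallVector.h"
  #include <utility>

  // Build the 'Includers' argument for a lookup relative to the current
  // working directory, mirroring the FrontendAction.cpp hunk below: a null
  // includer file paired with a value-like DirectoryEntryRef for ".".
  static llvm::SmallVector<
      std::pair<clang::OptionalFileEntryRef, clang::DirectoryEntryRef>, 1>
  cwdIncluders(clang::FileManager &FM) {
    llvm::SmallVector<
        std::pair<clang::OptionalFileEntryRef, clang::DirectoryEntryRef>, 1>
        Includers;
    if (auto CWD = FM.getOptionalDirectoryRef("."))
      Includers.push_back({nullptr, *CWD});
    return Includers;
  }

Callers can then use `DirectoryEntryRef::getName()` on the pair's second element directly, with no call through the deprecated `DirectoryEntry::getName()`.
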
Reviewed By: bnbarham, benlangmuir Differential Revision: https://reviews.llvm.org/D127663 --- clang/include/clang/Lex/HeaderSearch.h | 2 +- clang/lib/Frontend/FrontendAction.cpp | 8 +++----- clang/lib/Lex/HeaderSearch.cpp | 10 +++++----- clang/lib/Lex/PPDirectives.cpp | 17 +++++++++-------- clang/test/Modules/Inputs/filename/a.h | 1 - clang/test/Modules/Inputs/filename/module.map | 3 --- clang/test/Modules/filename.cpp | 19 ++++++++++++++----- 7 files changed, 32 insertions(+), 28 deletions(-) delete mode 100644 clang/test/Modules/Inputs/filename/a.h delete mode 100644 clang/test/Modules/Inputs/filename/module.map diff --git a/clang/include/clang/Lex/HeaderSearch.h b/clang/include/clang/Lex/HeaderSearch.h index 2a4e046be46fd..d3ee4963fced9 100644 --- a/clang/include/clang/Lex/HeaderSearch.h +++ b/clang/include/clang/Lex/HeaderSearch.h @@ -482,7 +482,7 @@ class HeaderSearch { OptionalFileEntryRef LookupFile( StringRef Filename, SourceLocation IncludeLoc, bool isAngled, ConstSearchDirIterator FromDir, ConstSearchDirIterator *CurDir, - ArrayRef> Includers, + ArrayRef> Includers, SmallVectorImpl *SearchPath, SmallVectorImpl *RelativePath, Module *RequestingModule, ModuleMap::KnownHeader *SuggestedModule, bool *IsMapped, bool *IsFrameworkFound, bool SkipCache = false, diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index c5893874e1d32..cfac2f8c4e5a6 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -825,11 +825,9 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, "trying to build a header unit without a Pre-processor?"); HeaderSearch &HS = CI.getPreprocessor().getHeaderSearchInfo(); // Relative searches begin from CWD. - const DirectoryEntry *Dir = nullptr; - if (auto DirOrErr = CI.getFileManager().getDirectory(".")) - Dir = *DirOrErr; - SmallVector, 1> CWD; - CWD.push_back({nullptr, Dir}); + auto Dir = CI.getFileManager().getOptionalDirectoryRef("."); + SmallVector, 1> CWD; + CWD.push_back({nullptr, *Dir}); OptionalFileEntryRef FE = HS.LookupFile(FileName, SourceLocation(), /*Angled*/ Input.getKind().getHeaderUnitKind() == diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index 7df1ca16f67ce..3366f158fd4f7 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -863,7 +863,7 @@ diagnoseFrameworkInclude(DiagnosticsEngine &Diags, SourceLocation IncludeLoc, OptionalFileEntryRef HeaderSearch::LookupFile( StringRef Filename, SourceLocation IncludeLoc, bool isAngled, ConstSearchDirIterator FromDir, ConstSearchDirIterator *CurDirArg, - ArrayRef> Includers, + ArrayRef> Includers, SmallVectorImpl *SearchPath, SmallVectorImpl *RelativePath, Module *RequestingModule, ModuleMap::KnownHeader *SuggestedModule, bool *IsMapped, bool *IsFrameworkFound, bool SkipCache, @@ -918,7 +918,7 @@ OptionalFileEntryRef HeaderSearch::LookupFile( // Concatenate the requested file onto the directory. // FIXME: Portability. Filename concatenation should be in sys::Path. 
- TmpDir = IncluderAndDir.second->getName(); + TmpDir = IncluderAndDir.second.getName(); TmpDir.push_back('/'); TmpDir.append(Filename.begin(), Filename.end()); @@ -957,7 +957,7 @@ OptionalFileEntryRef HeaderSearch::LookupFile( ToHFI.Framework = Framework; if (SearchPath) { - StringRef SearchPathRef(IncluderAndDir.second->getName()); + StringRef SearchPathRef(IncluderAndDir.second.getName()); SearchPath->clear(); SearchPath->append(SearchPathRef.begin(), SearchPathRef.end()); } @@ -967,7 +967,7 @@ OptionalFileEntryRef HeaderSearch::LookupFile( } if (First) { diagnoseFrameworkInclude(Diags, IncludeLoc, - IncluderAndDir.second->getName(), Filename, + IncluderAndDir.second.getName(), Filename, &FE->getFileEntry()); return FE; } @@ -1122,7 +1122,7 @@ OptionalFileEntryRef HeaderSearch::LookupFile( bool FoundByHeaderMap = !IsMapped ? false : *IsMapped; if (!Includers.empty()) diagnoseFrameworkInclude( - Diags, IncludeLoc, Includers.front().second->getName(), Filename, + Diags, IncludeLoc, Includers.front().second.getName(), Filename, &File->getFileEntry(), isAngled, FoundByHeaderMap); // Remember this location for the next lookup we do. diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 1a5398e3adea6..b3ce92f1699da 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -935,12 +935,11 @@ OptionalFileEntryRef Preprocessor::LookupFile( // If the header lookup mechanism may be relative to the current inclusion // stack, record the parent #includes. - SmallVector, 16> - Includers; + SmallVector, 16> Includers; bool BuildSystemModule = false; if (!FromDir && !FromFile) { FileID FID = getCurrentFileLexer()->getFileID(); - const FileEntry *FileEnt = SourceMgr.getFileEntryForID(FID); + OptionalFileEntryRef FileEnt = SourceMgr.getFileEntryRefForID(FID); // If there is no file entry associated with this file, it must be the // predefines buffer or the module includes buffer. 
Any other file is not @@ -958,11 +957,13 @@ OptionalFileEntryRef Preprocessor::LookupFile( if (FID == SourceMgr.getMainFileID() && MainFileDir) { Includers.push_back(std::make_pair(nullptr, *MainFileDir)); BuildSystemModule = getCurrentModule()->IsSystem; - } else if ((FileEnt = - SourceMgr.getFileEntryForID(SourceMgr.getMainFileID()))) - Includers.push_back(std::make_pair(FileEnt, *FileMgr.getDirectory("."))); + } else if ((FileEnt = SourceMgr.getFileEntryRefForID( + SourceMgr.getMainFileID()))) { + auto CWD = FileMgr.getOptionalDirectoryRef("."); + Includers.push_back(std::make_pair(*FileEnt, *CWD)); + } } else { - Includers.push_back(std::make_pair(FileEnt, FileEnt->getDir())); + Includers.push_back(std::make_pair(*FileEnt, FileEnt->getDir())); } // MSVC searches the current include stack from top to bottom for @@ -972,7 +973,7 @@ OptionalFileEntryRef Preprocessor::LookupFile( for (IncludeStackInfo &ISEntry : llvm::reverse(IncludeMacroStack)) { if (IsFileLexer(ISEntry)) if ((FileEnt = ISEntry.ThePPLexer->getFileEntry())) - Includers.push_back(std::make_pair(FileEnt, FileEnt->getDir())); + Includers.push_back(std::make_pair(*FileEnt, FileEnt->getDir())); } } } diff --git a/clang/test/Modules/Inputs/filename/a.h b/clang/test/Modules/Inputs/filename/a.h deleted file mode 100644 index 8f896a9ba8f41..0000000000000 --- a/clang/test/Modules/Inputs/filename/a.h +++ /dev/null @@ -1 +0,0 @@ -const char *p = __FILE__; diff --git a/clang/test/Modules/Inputs/filename/module.map b/clang/test/Modules/Inputs/filename/module.map deleted file mode 100644 index ff164ad7bac8e..0000000000000 --- a/clang/test/Modules/Inputs/filename/module.map +++ /dev/null @@ -1,3 +0,0 @@ -module "A" { - header "a.h" -} diff --git a/clang/test/Modules/filename.cpp b/clang/test/Modules/filename.cpp index e2b5ad141891f..7c42a7eddee38 100644 --- a/clang/test/Modules/filename.cpp +++ b/clang/test/Modules/filename.cpp @@ -1,8 +1,17 @@ -// RUN: cd %S -// RUN: %clang_cc1 -I. -fmodule-name=A -fmodule-map-file=%S/Inputs/filename/module.map %s -E | FileCheck %s +// RUN: rm -rf %t +// RUN: split-file %s %t -#include "Inputs/filename/a.h" +//--- include/a.h +const char *p = __FILE__; +//--- include/module.modulemap +module "A" { header "a.h" } +//--- src/tu.cpp +#include "a.h" + +// RUN: cd %t +// RUN: %clang_cc1 -I ./include -fmodule-name=A -fmodule-map-file=%t/include/module.modulemap %t/src/tu.cpp -E | FileCheck %s // Make sure that headers that are referenced by module maps have __FILE__ -// reflect the include path they were found with. -// CHECK: const char *p = "./Inputs/filename/a.h" +// reflect the include path they were found with. (We make sure they cannot be +// found relative to the includer.) 
+// CHECK: const char *p = "./include{{/|\\\\}}a.h" From a5bf4860eaee23c5bb7bd945516cd4d9f1873d5d Mon Sep 17 00:00:00 2001 From: csmoe Date: Tue, 30 May 2023 21:43:11 -0700 Subject: [PATCH 201/704] print user provide value in tabstop diagnostic github issue: https://github.com/llvm/llvm-project/issues/62912 Reviewed By: jansvoboda11 Differential Revision: https://reviews.llvm.org/D151429 --- clang/lib/Frontend/CompilerInvocation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index a1d836feea9d5..f74bca3b33b1d 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2394,9 +2394,9 @@ bool clang::ParseDiagnosticArgs(DiagnosticOptions &Opts, ArgList &Args, DiagMask = DiagnosticLevelMask::All; Opts.setVerifyIgnoreUnexpected(DiagMask); if (Opts.TabStop == 0 || Opts.TabStop > DiagnosticOptions::MaxTabStop) { - Opts.TabStop = DiagnosticOptions::DefaultTabStop; Diags->Report(diag::warn_ignoring_ftabstop_value) << Opts.TabStop << DiagnosticOptions::DefaultTabStop; + Opts.TabStop = DiagnosticOptions::DefaultTabStop; } addDiagnosticArgs(Args, OPT_W_Group, OPT_W_value_Group, Opts.Warnings); From 1e26c6ab05fd5b8a6c528ea9a325287af6440873 Mon Sep 17 00:00:00 2001 From: Shubham Sandeep Rastogi Date: Tue, 30 May 2023 21:46:10 -0700 Subject: [PATCH 202/704] Revert "Fix -u option in dsymutil, to not emit an extra DW_LNE_set_address if the original line table was empty" This reverts commit 9bd3ff8bf9e05e7bf95a4a068aa3b50813a93975. Reverting because of test failures: TEST 'LLVM :: tools/dsymutil/ARM/fat-dylib-update.test' FAILED Command Output (stderr): -- + : 'RUN: at line 2' + /b/ml-opt-rel-x86-64-b1/build/bin/dsymutil -oso-prepend-path /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/tools/dsymutil/ARM/.. /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/tools/dsymutil/ARM/../Inputs/fat-test.arm.dylib -o /b/ml-opt-rel-x86-64-b1/build/test/tools/dsymutil/ARM/Output/fat-dylib-update.test.tmp.dSYM warning: /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/tools/dsymutil/ARM/../Inputs/fat-test.arm.o: timestamp mismatch between object file (2023-05-24 23:42:39.442778779) and debug map (2015-08-05 21:31:26.000000000) warning: /b/ml-opt-rel-x86-64-b1/llvm-project/llvm/test/tools/dsymutil/ARM/../Inputs/fat-test.arm.o: timestamp mismatch between object file (2023-05-24 23:42:39.442778779) and debug map (2015-08-05 21:31:26.000000000) error: lipo: No such file or directory --- llvm/lib/DWARFLinker/DWARFLinker.cpp | 4 - .../tools/dsymutil/ARM/fat-dylib-update.test | 155 ------------------ 2 files changed, 159 deletions(-) delete mode 100644 llvm/test/tools/dsymutil/ARM/fat-dylib-update.test diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index 363cff430a662..af2e0db74066c 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -2082,10 +2082,6 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { // Set Line Table Rows. if (Linker.Options.Update) { LineTable.Rows = LT->Rows; - // If all the line table contains is a DW_LNE_end_sequence, clear the line - // table rows, it will be inserted again in the DWARFStreamer. 
- if (LineTable.Rows.size() == 1 && LineTable.Rows[0].EndSequence) - LineTable.Rows.clear(); LineTable.Sequences = LT->Sequences; } else { diff --git a/llvm/test/tools/dsymutil/ARM/fat-dylib-update.test b/llvm/test/tools/dsymutil/ARM/fat-dylib-update.test deleted file mode 100644 index fbe8f29953cd7..0000000000000 --- a/llvm/test/tools/dsymutil/ARM/fat-dylib-update.test +++ /dev/null @@ -1,155 +0,0 @@ -# REQUIRES: object-emission -# RUN: dsymutil -oso-prepend-path %p/.. %p/../Inputs/fat-test.arm.dylib -o %t.dSYM -# RUN: llvm-dwarfdump -a -v %t.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s -# RUN: dsymutil -u %t.dSYM -# RUN: llvm-dwarfdump -a -v %t.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s -# RUN: dsymutil -u %t.dSYM -o %t1.dSYM -# RUN: llvm-dwarfdump -a -v %t1.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s - -CHECK: /Contents/Resources/DWARF/fat-test.arm.dylib(armv7): file format Mach-O arm - -CHECK: .debug_info contents: -CHECK: Compile Unit: length = 0x00000034, format = DWARF32, version = 0x0002, abbr_offset = 0x0000, addr_size = 0x04 (next unit at 0x00000038) -CHECK: DW_TAG_compile_unit [1] * -CHECK: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000001] = "clang version 3.8.0 (trunk 243776)") -CHECK: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) -CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000024] = "fat-test.c") -CHECK: DW_AT_stmt_list [DW_FORM_data4] (0x00000000) -CHECK: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x0000002f] = "/Inputs") -CHECK: DW_TAG_variable [2] -CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "armv7_var") -CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x0030 => {0x00000030} -CHECK: DW_AT_external [DW_FORM_flag] (0x01) -CHECK: DW_AT_decl_file [DW_FORM_data1] ("/Inputs/fat-test.c") -CHECK: DW_AT_decl_line [DW_FORM_data1] (23) -CHECK: DW_AT_location [DW_FORM_block1] (DW_OP_addr 0x1000) -CHECK: DW_TAG_base_type [3] -CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000041] = "int") -CHECK: DW_AT_encoding [DW_FORM_data1] (DW_ATE_signed) -CHECK: DW_AT_byte_size [DW_FORM_data1] (0x04) -CHECK: NULL - - -CHECK: .debug_line contents: -CHECK: Line table prologue: -CHECK: total_length: 0x0000002a -CHECK: version: 2 -CHECK: prologue_length: 0x00000021 -CHECK: min_inst_length: 1 -CHECK: default_is_stmt: 1 -CHECK: line_base: -5 -CHECK: line_range: 14 -CHECK: opcode_base: 13 - -CHECK: .apple_names contents: -CHECK: String: 0x00000037 "armv7_var" -CHECK-NEXT: Data 0 [ -CHECK-NEXT: Atom[0]: 0x0000001e -CHECK-NEXT: ] - -CHECK: .apple_types contents: -CHECK: String: 0x00000041 "int" -CHECK-NEXT: Data 0 [ -CHECK-NEXT: Atom[0]: 0x00000030 -CHECK-NEXT: Atom[1]: 0x0024 -CHECK-NEXT: Atom[2]: 0x00 -CHECK-NEXT: Atom[3]: 0x0c3a28a4 -CHECK-NEXT: ] - -CHECK: /Contents/Resources/DWARF/fat-test.arm.dylib(armv7s): file format Mach-O arm - -CHECK: .debug_info contents: -CHECK: Compile Unit: length = 0x00000034, format = DWARF32, version = 0x0002, abbr_offset = 0x0000, addr_size = 0x04 (next unit at 0x00000038) -CHECK: DW_TAG_compile_unit [1] * -CHECK: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000001] = "clang version 3.8.0 (trunk 243776)") -CHECK: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) -CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000024] = "fat-test.c") -CHECK: DW_AT_stmt_list [DW_FORM_data4] (0x00000000) -CHECK: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x0000002f] = "/Inputs") -CHECK: DW_TAG_variable [2] -CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "armv7s_var") -CHECK: DW_AT_type 
[DW_FORM_ref4] (cu + 0x0030 => {0x00000030} -CHECK: DW_AT_external [DW_FORM_flag] (0x01) -CHECK: DW_AT_decl_file [DW_FORM_data1] ("/Inputs/fat-test.c") -CHECK: DW_AT_decl_line [DW_FORM_data1] (21) -CHECK: DW_AT_location [DW_FORM_block1] (DW_OP_addr 0x1000) -CHECK: DW_TAG_base_type [3] -CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000042] = "int") -CHECK: DW_AT_encoding [DW_FORM_data1] (DW_ATE_signed) -CHECK: DW_AT_byte_size [DW_FORM_data1] (0x04) -CHECK: NULL - -CHECK: .debug_line contents: -CHECK: Line table prologue: -CHECK: total_length: 0x0000002a -CHECK: version: 2 -CHECK: prologue_length: 0x00000021 -CHECK: min_inst_length: 1 -CHECK: default_is_stmt: 1 -CHECK: line_base: -5 -CHECK: line_range: 14 -CHECK: opcode_base: 13 - -CHECK: .apple_names contents: -CHECK: String: 0x00000037 "armv7s_var" -CHECK-NEXT: Data 0 [ -CHECK-NEXT: Atom[0]: 0x0000001e -CHECK-NEXT: ] - -CHECK: .apple_types contents: -CHECK: String: 0x00000042 "int" -CHECK-NEXT: Data 0 [ -CHECK-NEXT: Atom[0]: 0x00000030 -CHECK-NEXT: Atom[1]: 0x0024 -CHECK-NEXT: Atom[2]: 0x00 -CHECK-NEXT: Atom[3]: 0x0c3a28a4 -CHECK-NEXT: ] - -CHECK: /Contents/Resources/DWARF/fat-test.arm.dylib(arm64): file format Mach-O arm64 - -CHECK: .debug_info contents: -CHECK: Compile Unit: length = 0x00000038, format = DWARF32, version = 0x0002, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x0000003c) -CHECK: DW_TAG_compile_unit [1] * -CHECK: DW_AT_producer [DW_FORM_strp] ( .debug_str[0x00000001] = "clang version 3.8.0 (trunk 243776)") -CHECK: DW_AT_language [DW_FORM_data2] (DW_LANG_C99) -CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000024] = "fat-test.c") -CHECK: DW_AT_stmt_list [DW_FORM_data4] (0x00000000) -CHECK: DW_AT_comp_dir [DW_FORM_strp] ( .debug_str[0x0000002f] = "/Inputs") -CHECK: DW_TAG_variable [2] -CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000037] = "arm64_var") -CHECK: DW_AT_type [DW_FORM_ref4] (cu + 0x0034 => {0x00000034} -CHECK: DW_AT_external [DW_FORM_flag] (0x01) -CHECK: DW_AT_decl_file [DW_FORM_data1] ("/Inputs/fat-test.c") -CHECK: DW_AT_decl_line [DW_FORM_data1] (25) -CHECK: DW_AT_location [DW_FORM_block1] (DW_OP_addr 0x4000) -CHECK: DW_TAG_base_type [3] -CHECK: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000041] = "int") -CHECK: DW_AT_encoding [DW_FORM_data1] (DW_ATE_signed) -CHECK: DW_AT_byte_size [DW_FORM_data1] (0x04) -CHECK: NULL - -CHECK: .debug_line contents: -CHECK: Line table prologue: -CHECK: total_length: 0x0000002a -CHECK: version: 2 -CHECK: prologue_length: 0x00000021 -CHECK: min_inst_length: 1 -CHECK: default_is_stmt: 1 -CHECK: line_base: -5 -CHECK: line_range: 14 -CHECK: opcode_base: 13 - -CHECK: .apple_names contents: -CHECK: String: 0x00000037 "arm64_var" -CHECK-NEXT: Data 0 [ -CHECK-NEXT: Atom[0]: 0x0000001e -CHECK-NEXT: ] - -CHECK: .apple_types contents: -CHECK: String: 0x00000041 "int" -CHECK-NEXT: Data 0 [ -CHECK-NEXT: Atom[0]: 0x00000034 -CHECK-NEXT: Atom[1]: 0x0024 -CHECK-NEXT: Atom[2]: 0x00 -CHECK-NEXT: Atom[3]: 0x0c3a28a4 -CHECK-NEXT: ] From e517c5a897c3332ce12b584ff5e522395a0e0469 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Tue, 30 May 2023 21:58:00 -0700 Subject: [PATCH 203/704] [clang] Add test for -ftabstop diagnostics I forgot to request a regression test in review of D151429, so adding one myself. 
--- clang/test/Misc/tabstop.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/clang/test/Misc/tabstop.c b/clang/test/Misc/tabstop.c index 5b9bda9e541c0..500bf2a834a31 100644 --- a/clang/test/Misc/tabstop.c +++ b/clang/test/Misc/tabstop.c @@ -1,6 +1,7 @@ -// RUN: %clang_cc1 -ftabstop 3 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-3 -strict-whitespace %s -// RUN: %clang_cc1 -ftabstop 4 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-4 -strict-whitespace %s -// RUN: %clang_cc1 -ftabstop 5 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-5 -strict-whitespace %s +// RUN: %clang_cc1 -ftabstop 3 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-3 -strict-whitespace %s +// RUN: %clang_cc1 -ftabstop 4 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-4 -strict-whitespace %s +// RUN: %clang_cc1 -ftabstop 5 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-5 -strict-whitespace %s +// RUN: %clang_cc1 -ftabstop 101 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-101 -strict-whitespace %s // tab void* a = 1; @@ -54,3 +55,5 @@ void f(void) // CHECK-5: {{^ }} ( ) // CHECK-5: {{^ }}if (1 == 0 & 1) // CHECK-5: {{^ }} ( ) + +// CHECK-101: warning: ignoring invalid -ftabstop value '101', using default value 8 From f63155aaa6467bd2610820dfd1996af3bb6029a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 17 May 2023 14:04:30 +0200 Subject: [PATCH 204/704] [clang] Show line numbers in diagnostic code snippets Show line numbers to the left of diagnostic code snippets and increase the numbers of lines shown from 1 to 16. 
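Two options control the new snippet printing, as documented in the release-notes and user-manual hunks below: `-fno-diagnostics-show-line-numbers` disables the margin, and `-fcaret-diagnostics-max-lines=` caps how many source lines are printed (16 by default). As a rough illustration only (the file, warning, and exact spacing are made up for this example), a diagnostic snippet now renders with a numbered margin:

  test.c:5:7: warning: unused variable 'x' [-Wunused-variable]
      5 |   int x = 0;
        |       ^
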
Differential Revision: https://reviews.llvm.org/D147875 --- .../cert/uppercase-literal-suffix-integer.cpp | 16 ++-- .../uppercase-literal-suffix-float16.cpp | 6 +- ...eral-suffix-floating-point-opencl-half.cpp | 4 +- ...ppercase-literal-suffix-floating-point.cpp | 22 ++--- ...eral-suffix-hexadecimal-floating-point.cpp | 18 ++--- ...ase-literal-suffix-integer-custom-list.cpp | 8 +- .../uppercase-literal-suffix-integer-ms.cpp | 8 +- .../uppercase-literal-suffix-integer.cpp | 36 ++++----- clang/docs/ReleaseNotes.rst | 5 ++ clang/docs/UsersManual.rst | 32 ++++++++ .../include/clang/Basic/DiagnosticOptions.def | 2 + clang/include/clang/Basic/DiagnosticOptions.h | 3 +- clang/include/clang/Driver/Options.td | 4 + clang/include/clang/Frontend/TextDiagnostic.h | 3 +- clang/lib/Driver/ToolChains/Clang.cpp | 3 + clang/lib/Frontend/TextDiagnostic.cpp | 50 ++++++++++-- .../WebKit/uncounted-lambda-captures.cpp | 8 +- clang/test/FixIt/fixit-function-call.cpp | 2 +- clang/test/FixIt/fixit-newline-style.c | 2 +- .../FixIt/fixit-unicode-with-utf8-output.c | 2 +- clang/test/FixIt/fixit-unicode.c | 4 +- clang/test/Frontend/source-col-map.c | 2 +- clang/test/Lexer/header.cpp | 2 +- clang/test/Lexer/string-literal-errors.cpp | 2 +- clang/test/Misc/caret-diags-macros.c | 2 +- clang/test/Misc/caret-diags-multiline.cpp | 2 +- clang/test/Misc/diag-macro-backtrace.c | 2 +- clang/test/Misc/message-length.c | 2 +- clang/test/Misc/tabstop.c | 8 +- clang/test/Misc/unnecessary-elipses.cpp | 2 +- clang/test/Misc/unprintable.c | 2 +- clang/test/Misc/wrong-encoding.c | 2 +- clang/test/Parser/brackets.c | 2 +- clang/test/Parser/brackets.cpp | 2 +- clang/test/Preprocessor/ucn-pp-identifier.c | 6 +- clang/test/Sema/caret-diags-complex-init.cpp | 2 +- clang/test/SemaCXX/struct-class-redecl.cpp | 4 +- .../diagnostics/TestExprDiagnostics.py | 81 ++++++++++++++++--- 38 files changed, 254 insertions(+), 109 deletions(-) diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/uppercase-literal-suffix-integer.cpp b/clang-tools-extra/test/clang-tidy/checkers/cert/uppercase-literal-suffix-integer.cpp index 0dc06df4f18b4..6fa700bf06d4f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/uppercase-literal-suffix-integer.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cert/uppercase-literal-suffix-integer.cpp @@ -31,7 +31,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'l', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v5 = 1l; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}L{{$}} + // CHECK-MESSAGES-NEXT: {{^ *| *}}L{{$}} // CHECK-FIXES: static constexpr auto v5 = 1L; static_assert(is_same::value, ""); static_assert(v5 == 1, ""); @@ -46,7 +46,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'll', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v7 = 1ll; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LL{{$}} + // CHECK-MESSAGES-NEXT: {{^ *| *}}LL{{$}} // CHECK-FIXES: static constexpr auto v7 = 1LL; static_assert(is_same::value, ""); static_assert(v7 == 1, ""); @@ -79,7 +79,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'lu', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v13 = 1lu; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LU{{$}} + // CHECK-MESSAGES-NEXT: {{^ *| *}}LU{{$}} // CHECK-FIXES: static constexpr auto v13 = 1LU; static_assert(is_same::value, 
""); static_assert(v13 == 1, ""); @@ -88,7 +88,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'Lu', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v14 = 1Lu; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LU{{$}} + // CHECK-MESSAGES-NEXT: {{^ *| *}}LU{{$}} // CHECK-FIXES: static constexpr auto v14 = 1LU; static_assert(is_same::value, ""); static_assert(v14 == 1, ""); @@ -97,7 +97,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'lU', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v15 = 1lU; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LU{{$}} + // CHECK-MESSAGES-NEXT: {{^ *| *}}LU{{$}} // CHECK-FIXES: static constexpr auto v15 = 1LU; static_assert(is_same::value, ""); static_assert(v15 == 1, ""); @@ -130,7 +130,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'llu', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v21 = 1llu; // CHECK-MESSAGES-NEXT: ^~~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LLU{{$}} + // CHECK-MESSAGES-NEXT: {{^ *| *}}LLU{{$}} // CHECK-FIXES: static constexpr auto v21 = 1LLU; static_assert(is_same::value, ""); static_assert(v21 == 1, ""); @@ -139,7 +139,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'LLu', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v22 = 1LLu; // CHECK-MESSAGES-NEXT: ^~~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LLU{{$}} + // CHECK-MESSAGES-NEXT: {{^ *| *}}LLU{{$}} // CHECK-FIXES: static constexpr auto v22 = 1LLU; static_assert(is_same::value, ""); static_assert(v22 == 1, ""); @@ -148,7 +148,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'llU', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v23 = 1llU; // CHECK-MESSAGES-NEXT: ^~~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LLU{{$}} + // CHECK-MESSAGES-NEXT: {{^ *| *}}LLU{{$}} // CHECK-FIXES: static constexpr auto v23 = 1LLU; static_assert(is_same::value, ""); static_assert(v23 == 1, ""); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-float16.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-float16.cpp index a790597fcff51..46d7bc1347d0d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-float16.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-float16.cpp @@ -9,7 +9,7 @@ void float16_normal_literals() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'f16', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v14 = 1.f16; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F16{{$}} + // CHECK-MESSAGES-NEXT: F16{{$}} // CHECK-FIXES: static constexpr auto v14 = 1.F16; static_assert(is_same::value, ""); static_assert(v14 == 1.F16, ""); @@ -18,7 +18,7 @@ void float16_normal_literals() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'f16', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v15 = 1.e0f16; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F16{{$}} + // CHECK-MESSAGES-NEXT: F16{{$}} // CHECK-FIXES: static constexpr auto v15 = 1.e0F16; static_assert(is_same::value, ""); static_assert(v15 == 1.F16, ""); @@ -39,7 +39,7 @@ void 
float16_hexadecimal_literals() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'f16', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v13 = 0xfp0f16; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F16{{$}} + // CHECK-MESSAGES-NEXT: F16{{$}} // CHECK-FIXES: static constexpr auto v13 = 0xfp0F16; static_assert(is_same::value, ""); static_assert(v13 == 0xfp0F16, ""); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-floating-point-opencl-half.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-floating-point-opencl-half.cpp index 8cea2a4d827cb..ef905da6e9f95 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-floating-point-opencl-half.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-floating-point-opencl-half.cpp @@ -14,14 +14,14 @@ void floating_point_half_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: floating point literal has suffix 'h', which is not uppercase // CHECK-MESSAGES-NEXT: static half v2 = 1.h; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}H{{$}} + // CHECK-MESSAGES-NEXT: H{{$}} // CHECK-HIXES: static half v2 = 1.H; static half v3 = 1.e0h; // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: floating point literal has suffix 'h', which is not uppercase // CHECK-MESSAGES-NEXT: static half v3 = 1.e0h; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}H{{$}} + // CHECK-MESSAGES-NEXT: H{{$}} // CHECK-HIXES: static half v3 = 1.e0H; static half v4 = 1.H; // OK. diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-floating-point.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-floating-point.cpp index a0dcc38c8b231..d9f5bfbe3aa38 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-floating-point.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-floating-point.cpp @@ -20,7 +20,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: floating point literal has suffix 'f', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v2 = 1.f; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F{{$}} + // CHECK-MESSAGES-NEXT: F{{$}} // CHECK-FIXES: static constexpr auto v2 = 1.F; static_assert(is_same::value, ""); static_assert(v2 == 1.0F, ""); @@ -29,7 +29,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: floating point literal has suffix 'f', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v3 = 1.e0f; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F{{$}} + // CHECK-MESSAGES-NEXT: F{{$}} // CHECK-FIXES: static constexpr auto v3 = 1.e0F; static_assert(is_same::value, ""); static_assert(v3 == 1.0F, ""); @@ -48,7 +48,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: floating point literal has suffix 'l', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v6 = 1.l; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}L{{$}} + // CHECK-MESSAGES-NEXT: L{{$}} // CHECK-FIXES: static constexpr auto v6 = 1.L; static_assert(is_same::value, ""); static_assert(v6 == 1., ""); @@ -57,7 +57,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: floating point literal has suffix 'l', which is not uppercase // 
CHECK-MESSAGES-NEXT: static constexpr auto v7 = 1.e0l; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}L{{$}} + // CHECK-MESSAGES-NEXT: L{{$}} // CHECK-FIXES: static constexpr auto v7 = 1.e0L; static_assert(is_same::value, ""); static_assert(v7 == 1., ""); @@ -76,7 +76,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'q', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v10 = 1.q; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}Q{{$}} + // CHECK-MESSAGES-NEXT: Q{{$}} // CHECK-FIXES: static constexpr auto v10 = 1.Q; static_assert(is_same::value, ""); static_assert(v10 == 1., ""); @@ -85,7 +85,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'q', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v11 = 1.e0q; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}Q{{$}} + // CHECK-MESSAGES-NEXT: Q{{$}} // CHECK-FIXES: static constexpr auto v11 = 1.e0Q; static_assert(is_same::value, ""); static_assert(v11 == 1., ""); @@ -106,7 +106,7 @@ void floating_point_complex_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'i', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v14 = 1.i; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}I{{$}} + // CHECK-MESSAGES-NEXT: I{{$}} // CHECK-FIXES: static constexpr auto v14 = 1.I; static_assert(is_same::value, ""); static_assert(v14 == 1.I, ""); @@ -115,7 +115,7 @@ void floating_point_complex_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'i', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v15 = 1.e0i; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}I{{$}} + // CHECK-MESSAGES-NEXT: I{{$}} // CHECK-FIXES: static constexpr auto v15 = 1.e0I; static_assert(is_same::value, ""); static_assert(v15 == 1.I, ""); @@ -134,7 +134,7 @@ void floating_point_complex_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'j', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v18 = 1.j; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}J{{$}} + // CHECK-MESSAGES-NEXT: J{{$}} // CHECK-FIXES: static constexpr auto v18 = 1.J; static_assert(is_same::value, ""); static_assert(v18 == 1.J, ""); @@ -143,7 +143,7 @@ void floating_point_complex_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'j', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v19 = 1.e0j; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}J{{$}} + // CHECK-MESSAGES-NEXT: J{{$}} // CHECK-FIXES: static constexpr auto v19 = 1.e0J; static_assert(is_same::value, ""); static_assert(v19 == 1.J, ""); @@ -163,7 +163,7 @@ void macros() { // CHECK-MESSAGES: :[[@LINE-1]]:42: warning: floating point literal has suffix 'f', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto m0 = PASSTHROUGH(1.f); // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F{{$}} + // CHECK-MESSAGES-NEXT: F{{$}} // CHECK-FIXES: static constexpr auto m0 = PASSTHROUGH(1.F); static_assert(is_same::value, ""); static_assert(m0 == 1.0F, ""); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-hexadecimal-floating-point.cpp 
b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-hexadecimal-floating-point.cpp index 57d24fb5712c4..72077153fb718 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-hexadecimal-floating-point.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-hexadecimal-floating-point.cpp @@ -16,7 +16,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: floating point literal has suffix 'f', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v1 = 0xfp0f; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F{{$}} + // CHECK-MESSAGES-NEXT: F{{$}} // CHECK-FIXES: static constexpr auto v1 = 0xfp0F; static_assert(is_same::value, ""); static_assert(v1 == 15, ""); @@ -29,7 +29,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: floating point literal has suffix 'f', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v3 = 0xfP0f; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F{{$}} + // CHECK-MESSAGES-NEXT: F{{$}} // CHECK-FIXES: static constexpr auto v3 = 0xfP0F; static_assert(is_same::value, ""); static_assert(v3 == 15, ""); @@ -42,7 +42,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: floating point literal has suffix 'f', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v5 = 0xFP0f; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F{{$}} + // CHECK-MESSAGES-NEXT: F{{$}} // CHECK-FIXES: static constexpr auto v5 = 0xFP0F; static_assert(is_same::value, ""); static_assert(v5 == 15, ""); @@ -55,7 +55,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: floating point literal has suffix 'f', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v7 = 0xFp0f; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F{{$}} + // CHECK-MESSAGES-NEXT: F{{$}} // CHECK-FIXES: static constexpr auto v7 = 0xFp0F; static_assert(is_same::value, ""); static_assert(v7 == 15, ""); @@ -70,7 +70,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: floating point literal has suffix 'l', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v9 = 0xfp0l; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}L{{$}} + // CHECK-MESSAGES-NEXT: L{{$}} // CHECK-FIXES: static constexpr auto v9 = 0xfp0L; static_assert(is_same::value, ""); static_assert(v9 == 0xfp0, ""); @@ -85,7 +85,7 @@ void floating_point_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'q', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v11 = 0xfp0q; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}Q{{$}} + // CHECK-MESSAGES-NEXT: Q{{$}} // CHECK-FIXES: static constexpr auto v11 = 0xfp0Q; static_assert(is_same::value, ""); static_assert(v11 == 0xfp0, ""); @@ -102,7 +102,7 @@ void floating_point_complex_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point literal has suffix 'i', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v14 = 0xfp0i; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}I{{$}} + // CHECK-MESSAGES-NEXT: I{{$}} // CHECK-FIXES: static constexpr auto v14 = 0xfp0I; static_assert(is_same::value, ""); static_assert(v14 == 0xfp0I, ""); @@ -117,7 +117,7 @@ void floating_point_complex_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: floating point 
literal has suffix 'j', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v18 = 0xfp0j; // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}J{{$}} + // CHECK-MESSAGES-NEXT: J{{$}} // CHECK-FIXES: static constexpr auto v18 = 0xfp0J; static_assert(is_same::value, ""); static_assert(v18 == 0xfp0J, ""); @@ -133,7 +133,7 @@ void macros() { // CHECK-MESSAGES: :[[@LINE-1]]:42: warning: floating point literal has suffix 'f', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto m0 = PASSTHROUGH(0x0p0f); // CHECK-MESSAGES-NEXT: ^ ~ - // CHECK-MESSAGES-NEXT: {{^ *}}F{{$}} + // CHECK-MESSAGES-NEXT: F{{$}} // CHECK-FIXES: static constexpr auto m0 = PASSTHROUGH(0x0p0F); static_assert(is_same::value, ""); static_assert(m0 == 0x0p0F, ""); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-custom-list.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-custom-list.cpp index c787cab1fc4a7..5ee09527999eb 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-custom-list.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-custom-list.cpp @@ -22,7 +22,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'l', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v5 = 1l; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}L{{$}} + // CHECK-MESSAGES-NEXT: L{{$}} // CHECK-FIXES: static constexpr auto v5 = 1L; static_assert(is_same::value, ""); static_assert(v5 == 1, ""); @@ -47,7 +47,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'ul', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v9 = 1ul; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}uL{{$}} + // CHECK-MESSAGES-NEXT: uL{{$}} // CHECK-FIXES: static constexpr auto v9 = 1uL; static_assert(is_same::value, ""); static_assert(v9 == 1, ""); @@ -60,7 +60,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'Ul', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v11 = 1Ul; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}uL{{$}} + // CHECK-MESSAGES-NEXT: uL{{$}} // CHECK-FIXES: static constexpr auto v11 = 1uL; static_assert(is_same::value, ""); static_assert(v11 == 1, ""); @@ -69,7 +69,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'UL', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v12 = 1UL; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}uL{{$}} + // CHECK-MESSAGES-NEXT: uL{{$}} // CHECK-FIXES: static constexpr auto v12 = 1uL; static_assert(is_same::value, ""); static_assert(v12 == 1, ""); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-ms.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-ms.cpp index fe3269f2c506d..7ac4a7502e7b7 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-ms.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-ms.cpp @@ -21,7 +21,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'i32', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto 
v3 = 1i32; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}I32{{$}} + // CHECK-MESSAGES-NEXT: I32{{$}} // CHECK-FIXES: static constexpr auto v3 = 1I32; static_assert(is_same::value, ""); static_assert(v3 == 1I32, ""); @@ -36,7 +36,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'i64', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v5 = 1i64; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}I64{{$}} + // CHECK-MESSAGES-NEXT: I64{{$}} // CHECK-FIXES: static constexpr auto v5 = 1I64; static_assert(is_same::value, ""); static_assert(v5 == 1I64, ""); @@ -51,7 +51,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'i16', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v7 = 1i16; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}I16{{$}} + // CHECK-MESSAGES-NEXT: I16{{$}} // CHECK-FIXES: static constexpr auto v7 = 1I16; static_assert(is_same::value, ""); static_assert(v7 == 1I16, ""); @@ -66,7 +66,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'i8', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v9 = 1i8; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}I8{{$}} + // CHECK-MESSAGES-NEXT: I8{{$}} // CHECK-FIXES: static constexpr auto v9 = 1I8; static_assert(is_same::value, ""); static_assert(v9 == 1I8, ""); diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer.cpp index c03ccc23992fc..084d9f68e0b5e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer.cpp @@ -21,7 +21,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'u', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v3 = 1u; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}U{{$}} + // CHECK-MESSAGES-NEXT: U{{$}} // CHECK-FIXES: static constexpr auto v3 = 1U; static_assert(is_same::value, ""); static_assert(v3 == 1, ""); @@ -36,7 +36,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'l', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v5 = 1l; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}L{{$}} + // CHECK-MESSAGES-NEXT: L{{$}} // CHECK-FIXES: static constexpr auto v5 = 1L; static_assert(is_same::value, ""); static_assert(v5 == 1, ""); @@ -51,7 +51,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'll', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v7 = 1ll; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LL{{$}} + // CHECK-MESSAGES-NEXT: LL{{$}} // CHECK-FIXES: static constexpr auto v7 = 1LL; static_assert(is_same::value, ""); static_assert(v7 == 1, ""); @@ -66,7 +66,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: integer literal has suffix 'ul', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v9 = 1ul; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}UL{{$}} + // CHECK-MESSAGES-NEXT: UL{{$}} // CHECK-FIXES: static constexpr auto v9 = 1UL; static_assert(is_same::value, ""); static_assert(v9 == 
1, ""); @@ -75,7 +75,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'uL', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v10 = 1uL; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}UL{{$}} + // CHECK-MESSAGES-NEXT: UL{{$}} // CHECK-FIXES: static constexpr auto v10 = 1UL; static_assert(is_same::value, ""); static_assert(v10 == 1, ""); @@ -84,7 +84,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'Ul', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v11 = 1Ul; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}UL{{$}} + // CHECK-MESSAGES-NEXT: UL{{$}} // CHECK-FIXES: static constexpr auto v11 = 1UL; static_assert(is_same::value, ""); static_assert(v11 == 1, ""); @@ -99,7 +99,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'lu', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v13 = 1lu; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LU{{$}} + // CHECK-MESSAGES-NEXT: LU{{$}} // CHECK-FIXES: static constexpr auto v13 = 1LU; static_assert(is_same::value, ""); static_assert(v13 == 1, ""); @@ -108,7 +108,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'Lu', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v14 = 1Lu; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LU{{$}} + // CHECK-MESSAGES-NEXT: LU{{$}} // CHECK-FIXES: static constexpr auto v14 = 1LU; static_assert(is_same::value, ""); static_assert(v14 == 1, ""); @@ -117,7 +117,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'lU', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v15 = 1lU; // CHECK-MESSAGES-NEXT: ^~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LU{{$}} + // CHECK-MESSAGES-NEXT: LU{{$}} // CHECK-FIXES: static constexpr auto v15 = 1LU; static_assert(is_same::value, ""); static_assert(v15 == 1, ""); @@ -132,7 +132,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'ull', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v17 = 1ull; // CHECK-MESSAGES-NEXT: ^~~~ - // CHECK-MESSAGES-NEXT: {{^ *}}ULL{{$}} + // CHECK-MESSAGES-NEXT: ULL{{$}} // CHECK-FIXES: static constexpr auto v17 = 1ULL; static_assert(is_same::value, ""); static_assert(v17 == 1, ""); @@ -141,7 +141,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'uLL', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v18 = 1uLL; // CHECK-MESSAGES-NEXT: ^~~~ - // CHECK-MESSAGES-NEXT: {{^ *}}ULL{{$}} + // CHECK-MESSAGES-NEXT: ULL{{$}} // CHECK-FIXES: static constexpr auto v18 = 1ULL; static_assert(is_same::value, ""); static_assert(v18 == 1, ""); @@ -150,7 +150,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'Ull', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v19 = 1Ull; // CHECK-MESSAGES-NEXT: ^~~~ - // CHECK-MESSAGES-NEXT: {{^ *}}ULL{{$}} + // CHECK-MESSAGES-NEXT: ULL{{$}} // CHECK-FIXES: static constexpr auto v19 = 1ULL; static_assert(is_same::value, ""); static_assert(v19 == 1, ""); @@ -165,7 +165,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'llu', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr 
auto v21 = 1llu; // CHECK-MESSAGES-NEXT: ^~~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LLU{{$}} + // CHECK-MESSAGES-NEXT: LLU{{$}} // CHECK-FIXES: static constexpr auto v21 = 1LLU; static_assert(is_same::value, ""); static_assert(v21 == 1, ""); @@ -174,7 +174,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'LLu', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v22 = 1LLu; // CHECK-MESSAGES-NEXT: ^~~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LLU{{$}} + // CHECK-MESSAGES-NEXT: LLU{{$}} // CHECK-FIXES: static constexpr auto v22 = 1LLU; static_assert(is_same::value, ""); static_assert(v22 == 1, ""); @@ -183,7 +183,7 @@ void integer_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'llU', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v23 = 1llU; // CHECK-MESSAGES-NEXT: ^~~~ - // CHECK-MESSAGES-NEXT: {{^ *}}LLU{{$}} + // CHECK-MESSAGES-NEXT: LLU{{$}} // CHECK-FIXES: static constexpr auto v23 = 1LLU; static_assert(is_same::value, ""); static_assert(v23 == 1, ""); @@ -200,7 +200,7 @@ void integer_complex_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'i', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v25 = 1i; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}I{{$}} + // CHECK-MESSAGES-NEXT: I{{$}} // CHECK-FIXES: static constexpr auto v25 = 1I; static_assert(is_same::value, ""); static_assert(v25 == 1I, ""); @@ -215,7 +215,7 @@ void integer_complex_suffix() { // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: integer literal has suffix 'j', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto v27 = 1j; // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}J{{$}} + // CHECK-MESSAGES-NEXT: J{{$}} // CHECK-FIXES: static constexpr auto v27 = 1J; static_assert(is_same::value, ""); static_assert(v27 == 1J, ""); @@ -231,7 +231,7 @@ void macros() { // CHECK-MESSAGES: :[[@LINE-1]]:42: warning: integer literal has suffix 'u', which is not uppercase // CHECK-MESSAGES-NEXT: static constexpr auto m0 = PASSTHROUGH(1u); // CHECK-MESSAGES-NEXT: ^~ - // CHECK-MESSAGES-NEXT: {{^ *}}U{{$}} + // CHECK-MESSAGES-NEXT: U{{$}} // CHECK-FIXES: static constexpr auto m0 = PASSTHROUGH(1U); static_assert(is_same::value, ""); static_assert(m0 == 1, ""); diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8a9dd04cda5bb..b5814350a5f11 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -314,6 +314,11 @@ Improvements to Clang's diagnostics is an incomplete type. (`#55175: `_, and fixes an incorrect mention of ``alignof`` in a diagnostic about ``alignas``). +- Clang will now show a margin with line numbers to the left of each line + of code it prints for diagnostics. This can be disabled using + ``-fno-diagnostics-show-line-numbers``. At the same time, the maximum + number of code lines it prints has been increased from 1 to 16. This + can be controlled using ``-fcaret-diagnostics-max-lines=``. Bug Fixes in This Version ------------------------- diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index cee037a7ea89d..6b597242e3f93 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -575,6 +575,38 @@ output format of the diagnostics that it generates. [...]>>> +.. option:: -fcaret-diagnostics-max-lines: + + Controls how many lines of code clang prints for diagnostics. By default, + clang prints a maximum of 16 lines of code. + + +.. 
option:: -fdiagnostics-show-line-numbers: + + Controls whether clang will print a margin containing the line number on + the left of each line of code it prints for diagnostics. + + Default: + + :: + + test.cpp:5:1: error: 'main' must return 'int' + 5 | void main() {} + | ^~~~ + | int + + + With -fno-diagnostics-show-line-numbers: + + :: + + test.cpp:5:1: error: 'main' must return 'int' + void main() {} + ^~~~ + int + + + .. _cl_diag_warning_groups: Individual Warning Groups diff --git a/clang/include/clang/Basic/DiagnosticOptions.def b/clang/include/clang/Basic/DiagnosticOptions.def index 1d6d55a411e78..6d0c1b14acc12 100644 --- a/clang/include/clang/Basic/DiagnosticOptions.def +++ b/clang/include/clang/Basic/DiagnosticOptions.def @@ -90,6 +90,8 @@ VALUE_DIAGOPT(ConstexprBacktraceLimit, 32, DefaultConstexprBacktraceLimit) VALUE_DIAGOPT(SpellCheckingLimit, 32, DefaultSpellCheckingLimit) /// Limit number of lines shown in a snippet. VALUE_DIAGOPT(SnippetLineLimit, 32, DefaultSnippetLineLimit) +/// Show line number column on the left of snippets. +VALUE_DIAGOPT(ShowLineNumbers, 1, DefaultShowLineNumbers) VALUE_DIAGOPT(TabStop, 32, DefaultTabStop) /// The distance between tab stops. /// Column limit for formatting message diagnostics, or 0 if unused. diff --git a/clang/include/clang/Basic/DiagnosticOptions.h b/clang/include/clang/Basic/DiagnosticOptions.h index 4b0d45a3ff7c7..7e218b9c71e69 100644 --- a/clang/include/clang/Basic/DiagnosticOptions.h +++ b/clang/include/clang/Basic/DiagnosticOptions.h @@ -84,7 +84,8 @@ class DiagnosticOptions : public RefCountedBase{ DefaultTemplateBacktraceLimit = 10, DefaultConstexprBacktraceLimit = 10, DefaultSpellCheckingLimit = 50, - DefaultSnippetLineLimit = 1, + DefaultSnippetLineLimit = 16, + DefaultShowLineNumbers = 1, }; // Define simple diagnostic options (with no accessors). 
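For tools that construct the diagnostics engine themselves, the same two knobs are plain DiagnosticOptions fields; the TextDiagnostic.cpp hunks further down read them directly as DiagOpts->ShowLineNumbers. A minimal sketch, not part of the patch, with an illustrative helper name:

#include "clang/Basic/DiagnosticOptions.h"

// Illustrative only: restore the pre-patch rendering, i.e. no line-number
// margin and at most one snippet line per diagnostic.
static void useLegacySnippetStyle(clang::DiagnosticOptions &Opts) {
  Opts.ShowLineNumbers = 0;  // same effect as -fno-diagnostics-show-line-numbers
  Opts.SnippetLineLimit = 1; // same effect as -fcaret-diagnostics-max-lines=1
}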
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f6240f86447cb..41bd1c00eb229 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2566,6 +2566,10 @@ defm operator_names : BoolFOption<"operator-names", def fdiagnostics_absolute_paths : Flag<["-"], "fdiagnostics-absolute-paths">, Group, Flags<[CC1Option, CoreOption]>, HelpText<"Print absolute paths in diagnostics">, MarshallingInfoFlag>; +defm diagnostics_show_line_numbers : BoolFOption<"diagnostics-show-line-numbers", + DiagnosticOpts<"ShowLineNumbers">, DefaultTrue, + NegFlag, + PosFlag>; def fno_stack_protector : Flag<["-"], "fno-stack-protector">, Group, HelpText<"Disable the use of stack protectors">; def fno_strict_aliasing : Flag<["-"], "fno-strict-aliasing">, Group, diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h index a2eec46beccd1..7eb0ab0cdc9bc 100644 --- a/clang/include/clang/Frontend/TextDiagnostic.h +++ b/clang/include/clang/Frontend/TextDiagnostic.h @@ -103,7 +103,8 @@ class TextDiagnostic : public DiagnosticRenderer { SmallVectorImpl &Ranges, ArrayRef Hints); - void emitSnippet(StringRef SourceLine); + void emitSnippet(StringRef SourceLine, unsigned MaxLineNoDisplayWidth, + unsigned LineNo); void emitParseableFixits(ArrayRef Hints, const SourceManager &SM); }; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index d5e8718641754..e22c2ce7f2ede 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4111,6 +4111,9 @@ static void RenderDiagnosticsOptions(const Driver &D, const ArgList &Args, Args.addOptOutFlag(CmdArgs, options::OPT_fshow_source_location, options::OPT_fno_show_source_location); + Args.addOptOutFlag(CmdArgs, options::OPT_fdiagnostics_show_line_numbers, + options::OPT_fno_diagnostics_show_line_numbers); + if (Args.hasArg(options::OPT_fdiagnostics_absolute_paths)) CmdArgs.push_back("-fdiagnostics-absolute-paths"); diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index 08f84d28bb852..baf9b017fc83e 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -471,9 +471,7 @@ static void selectInterestingSourceRegion(std::string &SourceLine, CaretEnd = map.byteToColumn(SourceEnd) + CaretColumnsOutsideSource; // [CaretStart, CaretEnd) is the slice we want. Update the various - // output lines to show only this slice, with two-space padding - // before the lines so that it looks nicer. - + // output lines to show only this slice. assert(CaretStart!=(unsigned)-1 && CaretEnd!=(unsigned)-1 && SourceStart!=(unsigned)-1 && SourceEnd!=(unsigned)-1); assert(SourceStart <= SourceEnd); @@ -1120,6 +1118,14 @@ static std::string buildFixItInsertionLine(FileID FID, return FixItInsertionLine; } +static unsigned getNumDisplayWidth(unsigned N) { + unsigned L = 1u, M = 10u; + while (M <= N && ++L != std::numeric_limits::digits10 + 1) + M *= 10u; + + return L; +} + /// Emit a code snippet and caret line. /// /// This routine emits a single line's code snippet and caret line.. 
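The getNumDisplayWidth helper added above only counts how many decimal digits are needed to print N; that count is what sizes the new line-number margin. A standalone restatement with a few sanity checks (assumes a hosted toolchain; the digits10 + 1 cap covers the widest possible unsigned value):

#include <cassert>
#include <limits>

// Same logic as the helper above: number of decimal digits needed to show N.
static unsigned numDisplayWidth(unsigned N) {
  unsigned L = 1u, M = 10u;
  while (M <= N && ++L != std::numeric_limits<unsigned>::digits10 + 1)
    M *= 10u;
  return L;
}

int main() {
  assert(numDisplayWidth(0) == 1);
  assert(numDisplayWidth(9) == 1);
  assert(numDisplayWidth(10) == 2);
  assert(numDisplayWidth(1234) == 4);
  // The next hunk widens the margin to at least four columns via
  // std::max(4u, getNumDisplayWidth(DisplayLineNo + MaxLines)).
  return 0;
}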
@@ -1172,7 +1178,26 @@ void TextDiagnostic::emitSnippetAndCaret( Lines = maybeAddRange(Lines, *OptionalRange, MaxLines); } - for (unsigned LineNo = Lines.first; LineNo != Lines.second + 1; ++LineNo) { + // Our line numbers look like: + // " [number] | " + // Where [number] is MaxLineNoDisplayWidth columns + // and the full thing is therefore MaxLineNoDisplayWidth + 4 columns. + unsigned DisplayLineNo = Loc.getPresumedLoc().getLine(); + unsigned MaxLineNoDisplayWidth = + DiagOpts->ShowLineNumbers + ? std::max(4u, getNumDisplayWidth(DisplayLineNo + MaxLines)) + : 0; + auto indentForLineNumbers = [&] { + if (MaxLineNoDisplayWidth > 0) { + OS << ' '; + for (unsigned I = 0; I != MaxLineNoDisplayWidth; ++I) + OS << ' '; + OS << " | "; + } + }; + + for (unsigned LineNo = Lines.first; LineNo != Lines.second + 1; + ++LineNo, ++DisplayLineNo) { const char *BufStart = BufData.data(); const char *BufEnd = BufStart + BufData.size(); @@ -1245,9 +1270,10 @@ void TextDiagnostic::emitSnippetAndCaret( CaretLine.erase(CaretLine.end() - 1); // Emit what we have computed. - emitSnippet(SourceLine); + emitSnippet(SourceLine, MaxLineNoDisplayWidth, DisplayLineNo); if (!CaretLine.empty()) { + indentForLineNumbers(); if (DiagOpts->ShowColors) OS.changeColor(caretColor, true); OS << CaretLine << '\n'; @@ -1256,6 +1282,7 @@ void TextDiagnostic::emitSnippetAndCaret( } if (!FixItInsertionLine.empty()) { + indentForLineNumbers(); if (DiagOpts->ShowColors) // Print fixit line in color OS.changeColor(fixitColor, false); @@ -1271,7 +1298,8 @@ void TextDiagnostic::emitSnippetAndCaret( emitParseableFixits(Hints, SM); } -void TextDiagnostic::emitSnippet(StringRef line) { +void TextDiagnostic::emitSnippet(StringRef line, unsigned MaxLineNoDisplayWidth, + unsigned LineNo) { if (line.empty()) return; @@ -1280,6 +1308,16 @@ void TextDiagnostic::emitSnippet(StringRef line) { std::string to_print; bool print_reversed = false; + // Emit line number. 
+ if (MaxLineNoDisplayWidth > 0) { + unsigned LineNoDisplayWidth = getNumDisplayWidth(LineNo); + OS << ' '; + for (unsigned I = LineNoDisplayWidth; I < MaxLineNoDisplayWidth; ++I) + OS << ' '; + OS << LineNo; + OS << " | "; + } + while (i,bool> res = printableTextForNextCharacter(line, &i, DiagOpts->TabStop); diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp index 85dd77f9a8774..30798793ceab1 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp @@ -5,14 +5,14 @@ void raw_ptr() { RefCountable* ref_countable = nullptr; auto foo1 = [ref_countable](){}; // CHECK: warning: Captured raw-pointer 'ref_countable' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker] - // CHECK-NEXT:{{^}} auto foo1 = [ref_countable](){}; - // CHECK-NEXT:{{^}} ^ + // CHECK-NEXT:{{^ 6 | }} auto foo1 = [ref_countable](){}; + // CHECK-NEXT:{{^ | }} ^ auto foo2 = [&ref_countable](){}; // CHECK: warning: Captured raw-pointer 'ref_countable' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker] auto foo3 = [&](){ ref_countable = nullptr; }; // CHECK: warning: Implicitly captured raw-pointer 'ref_countable' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker] - // CHECK-NEXT:{{^}} auto foo3 = [&](){ ref_countable = nullptr; }; - // CHECK-NEXT:{{^}} ^ + // CHECK-NEXT:{{^ 12 | }} auto foo3 = [&](){ ref_countable = nullptr; }; + // CHECK-NEXT:{{^ | }} ^ auto foo4 = [=](){ (void) ref_countable; }; // CHECK: warning: Implicitly captured raw-pointer 'ref_countable' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker] } diff --git a/clang/test/FixIt/fixit-function-call.cpp b/clang/test/FixIt/fixit-function-call.cpp index 88f636ea5859d..17c50adffc4a2 100644 --- a/clang/test/FixIt/fixit-function-call.cpp +++ b/clang/test/FixIt/fixit-function-call.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fdiagnostics-parseable-fixits -x c++ %s 2> %t +// RUN: not %clang_cc1 -fdiagnostics-parseable-fixits -fno-diagnostics-show-line-numbers -fcaret-diagnostics-max-lines 1 -x c++ %s 2> %t // RUN: FileCheck %s < %t // PR5941 // END. diff --git a/clang/test/FixIt/fixit-newline-style.c b/clang/test/FixIt/fixit-newline-style.c index 8c5f424c39873..091b79426bcdf 100644 --- a/clang/test/FixIt/fixit-newline-style.c +++ b/clang/test/FixIt/fixit-newline-style.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -pedantic -Wunused-label -x c %s 2>&1 | FileCheck %s -strict-whitespace +// RUN: %clang_cc1 -pedantic -Wunused-label -fno-diagnostics-show-line-numbers -x c %s 2>&1 | FileCheck %s -strict-whitespace // This file intentionally uses a CRLF newline style // diff --git a/clang/test/FixIt/fixit-unicode-with-utf8-output.c b/clang/test/FixIt/fixit-unicode-with-utf8-output.c index a1a7bb7338095..f5aec89939adf 100644 --- a/clang/test/FixIt/fixit-unicode-with-utf8-output.c +++ b/clang/test/FixIt/fixit-unicode-with-utf8-output.c @@ -1,7 +1,7 @@ // This test is an additional set of checks for the fixit-unicode.c test for // systems capable of outputting Unicode characters to the standard output in // the UTF-8 encoding. 
-// RUN: not %clang_cc1 -fsyntax-only %S/fixit-unicode.c 2>&1 | FileCheck -strict-whitespace %s +// RUN: not %clang_cc1 -fsyntax-only -fno-diagnostics-show-line-numbers %S/fixit-unicode.c 2>&1 | FileCheck -strict-whitespace %s // REQUIRES: utf8-capable-terminal // CHECK: warning: format specifies type 'int' but the argument has type 'long' diff --git a/clang/test/FixIt/fixit-unicode.c b/clang/test/FixIt/fixit-unicode.c index 70c9751a2bcd6..87819cdfbea17 100644 --- a/clang/test/FixIt/fixit-unicode.c +++ b/clang/test/FixIt/fixit-unicode.c @@ -2,8 +2,8 @@ // There's a set of additional checks for systems with proper support of UTF-8 // on the standard output in fixit-unicode-with-utf8-output.c. -// RUN: not %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck -strict-whitespace %s -// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck -check-prefix=CHECK-MACHINE %s +// RUN: not %clang_cc1 -fsyntax-only -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck -strict-whitespace %s +// RUN: not %clang_cc1 -fsyntax-only -fno-diagnostics-show-line-numbers -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck -check-prefix=CHECK-MACHINE %s struct Foo { int bar; diff --git a/clang/test/Frontend/source-col-map.c b/clang/test/Frontend/source-col-map.c index b257261b8b2bc..20030a28f5c19 100644 --- a/clang/test/Frontend/source-col-map.c +++ b/clang/test/Frontend/source-col-map.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fsyntax-only -fmessage-length=75 -o /dev/null -x c < %s 2>&1 | FileCheck %s -strict-whitespace +// RUN: not %clang_cc1 -fsyntax-only -fno-diagnostics-show-line-numbers -fmessage-length=75 -o /dev/null -x c < %s 2>&1 | FileCheck %s -strict-whitespace // REQUIRES: utf8-capable-terminal // Test case for the text diagnostics source column conversion crash. 
diff --git a/clang/test/Lexer/header.cpp b/clang/test/Lexer/header.cpp index f02b1e69db439..34ed2686b0e72 100644 --- a/clang/test/Lexer/header.cpp +++ b/clang/test/Lexer/header.cpp @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -Wno-header-guard %s -// RUN: %clang_cc1 -fsyntax-only -Wheader-guard %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -fsyntax-only -Wheader-guard -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck %s #include "Inputs/good-header-guard.h" #include "Inputs/no-define.h" diff --git a/clang/test/Lexer/string-literal-errors.cpp b/clang/test/Lexer/string-literal-errors.cpp index 223dca4b94f62..c819ba435dbd2 100644 --- a/clang/test/Lexer/string-literal-errors.cpp +++ b/clang/test/Lexer/string-literal-errors.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck -strict-whitespace %s +// RUN: not %clang_cc1 -fsyntax-only -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck -strict-whitespace %s void foo() { (void)"\q \u123z \x \U \U123 \U12345 \u123 \xyzzy \777 \U" diff --git a/clang/test/Misc/caret-diags-macros.c b/clang/test/Misc/caret-diags-macros.c index e60ed389e656c..13b084b853acc 100644 --- a/clang/test/Misc/caret-diags-macros.c +++ b/clang/test/Misc/caret-diags-macros.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck %s -strict-whitespace +// RUN: %clang_cc1 -fsyntax-only -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck %s -strict-whitespace #define M1(x) x #define M2 1; diff --git a/clang/test/Misc/caret-diags-multiline.cpp b/clang/test/Misc/caret-diags-multiline.cpp index 90b50ff398ace..baf8e5a219be3 100644 --- a/clang/test/Misc/caret-diags-multiline.cpp +++ b/clang/test/Misc/caret-diags-multiline.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -std=c++11 -fcaret-diagnostics-max-lines 5 -Wsometimes-uninitialized %s 2>&1 | FileCheck %s --strict-whitespace +// RUN: not %clang_cc1 -std=c++11 -fno-diagnostics-show-line-numbers -fcaret-diagnostics-max-lines 5 -Wsometimes-uninitialized %s 2>&1 | FileCheck %s --strict-whitespace void line(int); diff --git a/clang/test/Misc/diag-macro-backtrace.c b/clang/test/Misc/diag-macro-backtrace.c index 8d7d5726ad82d..0eb7bbb9f3355 100644 --- a/clang/test/Misc/diag-macro-backtrace.c +++ b/clang/test/Misc/diag-macro-backtrace.c @@ -1,4 +1,4 @@ -// RUN: not %clang -fsyntax-only -fmacro-backtrace-limit=0 %s 2>&1 | FileCheck %s +// RUN: not %clang -fsyntax-only -fno-diagnostics-show-line-numbers -fmacro-backtrace-limit=0 %s 2>&1 | FileCheck %s #define FOO 1+"hi" #define BAR FOO diff --git a/clang/test/Misc/message-length.c b/clang/test/Misc/message-length.c index 1e0b4edb7c032..4e2818121a132 100644 --- a/clang/test/Misc/message-length.c +++ b/clang/test/Misc/message-length.c @@ -28,7 +28,7 @@ void a_very_long_line(int *ip, float *FloatPointer) { #pragma STDC CX_LIMITED_RANGE // some long comment text and a brace, eh {} // CHECK: FILE:23:78 -// CHECK: {{^ ...// some long comment text and a brace, eh {}}} +// CHECK: {{^ 23 | ...// some long comment text and a brace, eh {}}} struct A { int x; }; void h(struct A *a) { diff --git a/clang/test/Misc/tabstop.c b/clang/test/Misc/tabstop.c index 500bf2a834a31..834d1839a3d8b 100644 --- a/clang/test/Misc/tabstop.c +++ b/clang/test/Misc/tabstop.c @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -ftabstop 3 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-3 -strict-whitespace %s -// RUN: %clang_cc1 -ftabstop 4 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-4 -strict-whitespace %s -// RUN: %clang_cc1 
-ftabstop 5 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-5 -strict-whitespace %s -// RUN: %clang_cc1 -ftabstop 101 -fsyntax-only -Wno-error=int-conversion %s 2>&1 | FileCheck -check-prefix=CHECK-101 -strict-whitespace %s +// RUN: %clang_cc1 -ftabstop 3 -fsyntax-only -Wno-error=int-conversion -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck -check-prefix=CHECK-3 -strict-whitespace %s +// RUN: %clang_cc1 -ftabstop 4 -fsyntax-only -Wno-error=int-conversion -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck -check-prefix=CHECK-4 -strict-whitespace %s +// RUN: %clang_cc1 -ftabstop 5 -fsyntax-only -Wno-error=int-conversion -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck -check-prefix=CHECK-5 -strict-whitespace %s +// RUN: %clang_cc1 -ftabstop 101 -fsyntax-only -Wno-error=int-conversion -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck -check-prefix=CHECK-101 -strict-whitespace %s // tab void* a = 1; diff --git a/clang/test/Misc/unnecessary-elipses.cpp b/clang/test/Misc/unnecessary-elipses.cpp index c8c178c37f6c6..b9f7cb7d3e00e 100644 --- a/clang/test/Misc/unnecessary-elipses.cpp +++ b/clang/test/Misc/unnecessary-elipses.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -fmessage-length=80 %s 2>&1 | FileCheck -strict-whitespace %s +// RUN: %clang_cc1 -fsyntax-only -fmessage-length=80 -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck -strict-whitespace %s int main() { "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; diff --git a/clang/test/Misc/unprintable.c b/clang/test/Misc/unprintable.c index 30e4494566300..02402cdfa6693 100644 --- a/clang/test/Misc/unprintable.c +++ b/clang/test/Misc/unprintable.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 %s -fmessage-length=40 2>&1 | FileCheck -strict-whitespace %s +// RUN: not %clang_cc1 %s -fmessage-length=40 -fno-diagnostics-show-line-numbers 2>&1 | FileCheck -strict-whitespace %s int main() { int i; diff --git a/clang/test/Misc/wrong-encoding.c b/clang/test/Misc/wrong-encoding.c index 4d8aa94ffffb2..98c1ecb9b5a66 100644 --- a/clang/test/Misc/wrong-encoding.c +++ b/clang/test/Misc/wrong-encoding.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck %s // REQUIRES: asserts void foo(void) { diff --git a/clang/test/Parser/brackets.c b/clang/test/Parser/brackets.c index a651c35d073a8..1821747d13515 100644 --- a/clang/test/Parser/brackets.c +++ b/clang/test/Parser/brackets.c @@ -2,7 +2,7 @@ // RUN: cp %s %t // RUN: not %clang_cc1 -fixit %t -x c -DFIXIT // RUN: %clang_cc1 -fsyntax-only %t -x c -DFIXIT -// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s -strict-whitespace +// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck %s -strict-whitespace void test1(void) { int a[] = {0,1,1,2,3}; diff --git a/clang/test/Parser/brackets.cpp b/clang/test/Parser/brackets.cpp index 415cbdda83f5f..40b08c37a06a1 100644 --- a/clang/test/Parser/brackets.cpp +++ b/clang/test/Parser/brackets.cpp @@ -2,7 +2,7 @@ // RUN: cp %s %t // RUN: not %clang_cc1 -fixit %t -x c++ -DFIXIT // RUN: %clang_cc1 -fsyntax-only %t -x c++ -DFIXIT -// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s -strict-whitespace +// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits -fno-diagnostics-show-line-numbers %s 
2>&1 | FileCheck %s -strict-whitespace void test1() { int a[] = {0,1,1,2,3}; diff --git a/clang/test/Preprocessor/ucn-pp-identifier.c b/clang/test/Preprocessor/ucn-pp-identifier.c index e1bcdfdcaa364..0cdff6e9db568 100644 --- a/clang/test/Preprocessor/ucn-pp-identifier.c +++ b/clang/test/Preprocessor/ucn-pp-identifier.c @@ -112,9 +112,9 @@ C 1 #define capital_u_\U00FC // expected-warning@-1 {{incomplete universal character name}} expected-note@-1 {{did you mean to use '\u'?}} expected-warning@-1 {{whitespace}} // CHECK: note: did you mean to use '\u'? -// CHECK-NEXT: #define capital_u_\U00FC -// CHECK-NEXT: {{^ \^}} -// CHECK-NEXT: {{^ u}} +// CHECK-NEXT: {{^ 112 | #define capital_u_\U00FC}} +// CHECK-NEXT: {{^ | \^}} +// CHECK-NEXT: {{^ | u}} #define \u{} // expected-warning {{empty delimited universal character name; treating as '\' 'u' '{' '}'}} expected-error {{macro name must be an identifier}} #define \u1{123} // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{macro name must be an identifier}} diff --git a/clang/test/Sema/caret-diags-complex-init.cpp b/clang/test/Sema/caret-diags-complex-init.cpp index d8a1b7837a640..83aab5c4e0b7d 100644 --- a/clang/test/Sema/caret-diags-complex-init.cpp +++ b/clang/test/Sema/caret-diags-complex-init.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -std=c++11 -fsyntax-only -fcaret-diagnostics-max-lines 5 %s 2>&1 | FileCheck %s -strict-whitespace +// RUN: not %clang_cc1 -std=c++11 -fsyntax-only -fno-diagnostics-show-line-numbers -fcaret-diagnostics-max-lines 5 %s 2>&1 | FileCheck %s -strict-whitespace //CHECK: {{.*}}: error: excess elements in scalar initializer diff --git a/clang/test/SemaCXX/struct-class-redecl.cpp b/clang/test/SemaCXX/struct-class-redecl.cpp index 622d5a0b652a7..ab6488b237f55 100644 --- a/clang/test/SemaCXX/struct-class-redecl.cpp +++ b/clang/test/SemaCXX/struct-class-redecl.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -Wmismatched-tags -verify %s -// RUN: not %clang_cc1 -fsyntax-only -Wmismatched-tags %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -fsyntax-only -Wmismatched-tags -fno-diagnostics-show-line-numbers -verify %s +// RUN: not %clang_cc1 -fsyntax-only -Wmismatched-tags -fno-diagnostics-show-line-numbers %s 2>&1 | FileCheck %s class X; // expected-note 2{{here}} typedef struct X * X_t; // expected-warning{{previously declared}} union X { int x; float y; }; // expected-error{{use of 'X' with tag type that does not match previous declaration}} diff --git a/lldb/test/API/commands/expression/diagnostics/TestExprDiagnostics.py b/lldb/test/API/commands/expression/diagnostics/TestExprDiagnostics.py index 72154ef4622e1..bf1fc8e4e0e0a 100644 --- a/lldb/test/API/commands/expression/diagnostics/TestExprDiagnostics.py +++ b/lldb/test/API/commands/expression/diagnostics/TestExprDiagnostics.py @@ -30,20 +30,38 @@ def test_source_and_caret_printing(self): self.assertFalse(value.GetError().Success()) # We should get a nice diagnostic with a caret pointing at the start of # the identifier. - self.assertIn("\nunknown_identifier\n^\n", value.GetError().GetCString()) + self.assertIn( + """ + 1 | unknown_identifier + | ^ +""", + value.GetError().GetCString(), + ) self.assertIn(":1:1", value.GetError().GetCString()) # Same as above but with the identifier in the middle. 
- value = frame.EvaluateExpression("1 + unknown_identifier ") + value = frame.EvaluateExpression("1 + unknown_identifier") self.assertFalse(value.GetError().Success()) - self.assertIn("\n1 + unknown_identifier", value.GetError().GetCString()) - self.assertIn("\n ^\n", value.GetError().GetCString()) + self.assertIn( + """ + 1 | 1 + unknown_identifier + | ^ +""", + value.GetError().GetCString(), + ) # Multiline expressions. value = frame.EvaluateExpression("int a = 0;\nfoobar +=1;\na") self.assertFalse(value.GetError().Success()) # We should still get the right line information and caret position. - self.assertIn("\nfoobar +=1;\n^\n", value.GetError().GetCString()) + self.assertIn( + """ + 2 | foobar +=1; + | ^ +""", + value.GetError().GetCString(), + ) + # It's the second line of the user expression. self.assertIn(":2:1", value.GetError().GetCString()) @@ -54,8 +72,13 @@ def test_source_and_caret_printing(self): value = frame.EvaluateExpression("void foo(unknown_type x) {}", top_level_opts) self.assertFalse(value.GetError().Success()) self.assertIn( - "\nvoid foo(unknown_type x) {}\n ^\n", value.GetError().GetCString() + """ + 1 | void foo(unknown_type x) {} + | ^ +""", + value.GetError().GetCString(), ) + # Top-level expressions might use a different wrapper code, but the file name should still # be the same. self.assertIn(":1:10", value.GetError().GetCString()) @@ -63,7 +86,14 @@ def test_source_and_caret_printing(self): # Multiline top-level expressions. value = frame.EvaluateExpression("void x() {}\nvoid foo;", top_level_opts) self.assertFalse(value.GetError().Success()) - self.assertIn("\nvoid foo;\n ^", value.GetError().GetCString()) + self.assertIn( + """ + 2 | void foo; + | ^ +""", + value.GetError().GetCString(), + ) + self.assertIn(":2:6", value.GetError().GetCString()) # Test that we render Clang's 'notes' correctly. 
@@ -72,7 +102,14 @@ def test_source_and_caret_printing(self): ) self.assertFalse(value.GetError().Success()) self.assertIn( - ":1:8: previous definition is here\nstruct SFoo{}; struct SFoo { int x; };\n ^\n", + ":1:8: previous definition is here\n", + value.GetError().GetCString(), + ) + self.assertIn( + """ + 1 | struct SFoo{}; struct SFoo { int x; }; + | ^ +""", value.GetError().GetCString(), ) @@ -82,14 +119,29 @@ def test_source_and_caret_printing(self): value = frame.EvaluateExpression("struct FooBar { double x };", top_level_opts) self.assertFalse(value.GetError().Success()) self.assertIn( - "error: :1:8: redefinition of 'FooBar'\nstruct FooBar { double x };\n ^\n", + "error: :1:8: redefinition of 'FooBar'\n", + value.GetError().GetCString(), + ) + self.assertIn( + """ + 1 | struct FooBar { double x }; + | ^ +""", value.GetError().GetCString(), ) value = frame.EvaluateExpression("foo(1, 2)") self.assertFalse(value.GetError().Success()) self.assertIn( - "error: :1:1: no matching function for call to 'foo'\nfoo(1, 2)\n^~~\nnote: candidate function not viable: requires single argument 'x', but 2 arguments were provided\n\n", + "error: :1:1: no matching function for call to 'foo'\n", + value.GetError().GetCString(), + ) + self.assertIn( + """ + 1 | foo(1, 2) + | ^~~ +note: candidate function not viable: requires single argument 'x', but 2 arguments were provided +""", value.GetError().GetCString(), ) @@ -99,7 +151,14 @@ def test_source_and_caret_printing(self): value = frame.EvaluateExpression("struct Redef { float y; };", top_level_opts) self.assertFalse(value.GetError().Success()) self.assertIn( - "error: :1:8: redefinition of 'Redef'\nstruct Redef { float y; };\n ^\n:1:8: previous definition is here\nstruct Redef { double x; };\n ^", + """ +error: :1:8: redefinition of 'Redef' + 1 | struct Redef { float y; }; + | ^ +:1:8: previous definition is here + 1 | struct Redef { double x; }; + | ^ +""", value.GetError().GetCString(), ) From a33099f0fef958bed6fc7a09c7f0df1310ba6cfc Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Wed, 31 May 2023 01:30:56 -0400 Subject: [PATCH 205/704] [mlir] Add the missing non-member operator decl in the same namesapce. NFC --- mlir/include/mlir/Interfaces/InferIntRangeInterface.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h index 7df4cc87c2e10..05064a72ef02e 100644 --- a/mlir/include/mlir/Interfaces/InferIntRangeInterface.h +++ b/mlir/include/mlir/Interfaces/InferIntRangeInterface.h @@ -103,6 +103,8 @@ class ConstantIntRanges { APInt uminVal, umaxVal, sminVal, smaxVal; }; +raw_ostream &operator<<(raw_ostream &, const ConstantIntRanges &); + /// The type of the `setResultRanges` callback provided to ops implementing /// InferIntRangeInterface. It should be called once for each integer result /// value and be passed the ConstantIntRanges corresponding to that value. 
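The fix is ordinary C++ hygiene rather than anything MLIR-specific: a non-member stream operator should be declared at namespace scope next to the class it prints, so that every translation unit including the header can resolve it. A hedged illustration with made-up names (mylib::Range stands in for the real type):

#include <ostream>

namespace mylib {
class Range { /* ... */ };

// Without this namespace-scope declaration in the header, a caller that only
// includes the header cannot resolve `os << r`, even if the operator is
// defined in some .cpp file inside namespace mylib.
std::ostream &operator<<(std::ostream &os, const Range &r);
} // namespace mylib

inline void dump(std::ostream &os, const mylib::Range &r) {
  os << r; // found via argument-dependent lookup in namespace mylib
}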
From 0706a53a1b02a70ac3ab163a29c5a3ab5b4f18e8 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 30 May 2023 22:40:45 -0700 Subject: [PATCH 206/704] [NFC][sanitizer] Change ArrayRef constructor --- compiler-rt/lib/sanitizer_common/sanitizer_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index d43e066ca1b9b..358878e81c948 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -1086,7 +1086,7 @@ class ArrayRef { ArrayRef(T *begin, T *end) : begin_(begin), end_(end) {} template - ArrayRef(const C &src) : begin_(src.begin()), end_(src.end()) {} + ArrayRef(const C &src) : begin_(src.data()), end_(src.data() + src.size()) {} const T *begin() const { return begin_; } const T *end() const { return end_; } From ac1df22315a55c799239090097b6d6e0e9a916d8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 30 May 2023 22:45:42 -0700 Subject: [PATCH 207/704] [RISCV] Merge emitDirectiveOptionArchPlus and emitDirectiveOptionArchMinus into a single interface. NFC Probably going to do some other refactors after this, but this one was easy and clearly reduces duplicate code. Reviewed By: StephenFan Differential Revision: https://reviews.llvm.org/D151771 --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 8 +++--- .../MCTargetDesc/RISCVTargetStreamer.cpp | 26 ++++++------------- .../RISCV/MCTargetDesc/RISCVTargetStreamer.h | 15 +++++------ 3 files changed, 18 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index e012907a95f08..6006a133ddae1 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2782,8 +2782,8 @@ bool RISCVAsmParser::parseDirectiveOption() { return Error(Loc, OutputErrMsg.str()); } - getTargetStreamer().emitDirectiveOptionArchPlus(Ext->Key, PrefixEmitted, - HasComma); + getTargetStreamer().emitDirectiveOptionArchPlusOrMinus( + Ext->Key, /*Enable*/ true, PrefixEmitted, HasComma); } else { // It is invalid to disable an extension that there are other enabled // extensions depend on it. 
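For readers who have not seen the directive itself, the merged entry point is what ultimately prints text such as ".option arch, +zbb, -c" into the assembly stream. A rough model with simplified names, not the real streamer classes, and assuming the usual tab-separated directive formatting:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

// One entry point handles both the '+' (enable) and '-' (disable) cases,
// which is the duplication this patch removes.
void emitOptionArch(std::ostream &OS,
                    const std::vector<std::pair<std::string, bool>> &Exts) {
  OS << "\t.option\tarch";
  for (const auto &Ext : Exts)
    OS << ", " << (Ext.second ? '+' : '-') << Ext.first;
  OS << '\n';
}

int main() {
  // Prints something like: .option arch, +zbb, -c
  emitOptionArch(std::cout, {{"zbb", true}, {"c", false}});
}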
@@ -2798,8 +2798,8 @@ bool RISCVAsmParser::parseDirectiveOption() { } clearFeatureBits(Ext->Value, Ext->Key); - getTargetStreamer().emitDirectiveOptionArchMinus( - Ext->Key, PrefixEmitted, HasComma); + getTargetStreamer().emitDirectiveOptionArchPlusOrMinus( + Ext->Key, /*Enable*/ false, PrefixEmitted, HasComma); } if (!HasComma) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp index 3c8c704ef54e7..28e7fc6157c3a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp @@ -35,13 +35,10 @@ void RISCVTargetStreamer::emitDirectiveOptionRelax() {} void RISCVTargetStreamer::emitDirectiveOptionNoRelax() {} void RISCVTargetStreamer::emitDirectiveVariantCC(MCSymbol &Symbol) {} void RISCVTargetStreamer::emitDirectiveOptionArchFullArch(StringRef Value, - bool &hasDotOption) {} -void RISCVTargetStreamer::emitDirectiveOptionArchPlus(StringRef Value, - bool &hasDotOption, - bool EmitComma) {} -void RISCVTargetStreamer::emitDirectiveOptionArchMinus(StringRef Value, - bool &hasDotOption, - bool EmitComma) {} + bool &PrefixEmitted) { +} +void RISCVTargetStreamer::emitDirectiveOptionArchPlusOrMinus( + StringRef Value, bool Enable, bool &PrefixEmitted, bool EmitComma) {} void RISCVTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {} void RISCVTargetStreamer::finishAttributeSection() {} void RISCVTargetStreamer::emitTextAttribute(unsigned Attribute, @@ -147,18 +144,11 @@ void RISCVTargetAsmStreamer::emitDirectiveOptionArchFullArch( OS << Value; emitCommaOrNextLine(OS, false); } -void RISCVTargetAsmStreamer::emitDirectiveOptionArchPlus(StringRef Value, - bool &PrefixEmitted, - bool EmitComma) { - emitDirectiveOptionArchPrefix(OS, PrefixEmitted); - OS << "+" << Value; - emitCommaOrNextLine(OS, EmitComma); -} -void RISCVTargetAsmStreamer::emitDirectiveOptionArchMinus(StringRef Value, - bool &PrefixEmitted, - bool EmitComma) { + +void RISCVTargetAsmStreamer::emitDirectiveOptionArchPlusOrMinus( + StringRef Value, bool Enable, bool &PrefixEmitted, bool EmitComma) { emitDirectiveOptionArchPrefix(OS, PrefixEmitted); - OS << "-" << Value; + OS << (Enable ? 
"+" : "-") << Value; emitCommaOrNextLine(OS, EmitComma); } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h index 48da70503fcc8..4baed99364b00 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h @@ -36,11 +36,9 @@ class RISCVTargetStreamer : public MCTargetStreamer { virtual void emitDirectiveVariantCC(MCSymbol &Symbol); virtual void emitDirectiveOptionArchFullArch(StringRef Value, bool &PrefixEmitted); - virtual void emitDirectiveOptionArchPlus(StringRef Value, bool &PrefixEmitted, - bool EmitComma); - virtual void emitDirectiveOptionArchMinus(StringRef Value, - bool &PrefixEmitted, - bool EmitComma); + virtual void emitDirectiveOptionArchPlusOrMinus(StringRef Value, bool Enable, + bool &PrefixEmitted, + bool EmitComma); virtual void emitAttribute(unsigned Attribute, unsigned Value); virtual void finishAttributeSection(); virtual void emitTextAttribute(unsigned Attribute, StringRef String); @@ -76,10 +74,9 @@ class RISCVTargetAsmStreamer : public RISCVTargetStreamer { void emitDirectiveVariantCC(MCSymbol &Symbol) override; void emitDirectiveOptionArchFullArch(StringRef Value, bool &PrefixEmitted) override; - void emitDirectiveOptionArchPlus(StringRef Value, bool &PrefixEmitted, - bool EmitComma) override; - void emitDirectiveOptionArchMinus(StringRef Value, bool &PrefixEmitted, - bool EmitComma) override; + void emitDirectiveOptionArchPlusOrMinus(StringRef Value, bool Enable, + bool &PrefixEmitted, + bool EmitComma) override; }; } From 4de9936fe0e31ceb817db1cdfc5dd4af2d44e01e Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Wed, 31 May 2023 07:55:05 +0200 Subject: [PATCH 208/704] [libc++][CI] Escape quoting. This should fix the Clang CI. --- libcxx/utils/ci/buildkite-pipeline-clang.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/utils/ci/buildkite-pipeline-clang.yml b/libcxx/utils/ci/buildkite-pipeline-clang.yml index 323f4cf80b890..b951c3bbf2b22 100644 --- a/libcxx/utils/ci/buildkite-pipeline-clang.yml +++ b/libcxx/utils/ci/buildkite-pipeline-clang.yml @@ -23,7 +23,7 @@ steps: # We use Release here to avoid including debug information. Otherwise, the clang binary is very large, which # is problematic because we need to upload the artifacts for other jobs to use. This may seem like nothing, # but with the number of jobs we run daily, this can result in thousands of GB of network I/O. - - "cmake -S llvm -B build -G Ninja -DCMAKE_CXX_COMPILER_LAUNCHER="ccache" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DLLVM_ENABLE_PROJECTS=\"clang;compiler-rt\"" + - "cmake -S llvm -B build -G Ninja -DCMAKE_CXX_COMPILER_LAUNCHER=\"ccache\" -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DLLVM_ENABLE_PROJECTS=\"clang;compiler-rt\"" - "ninja -C build install-clang install-clang-resource-headers" - "ccache -s" - "tar -cJvf install.tar.xz install/" From f28ed7f695288eba2b0e22996de92df5f7ef6368 Mon Sep 17 00:00:00 2001 From: Enna1 Date: Wed, 31 May 2023 14:03:53 +0800 Subject: [PATCH 209/704] [NFC][HWASAN] make variables names for callback functions more consistent This patch makes the variables names for callback functions more consistent. Changes no functionality. 
Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D151605 --- .../Instrumentation/HWAddressSanitizer.cpp | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 2ad090b0a274a..28db47a190927 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -337,8 +337,6 @@ class HWAddressSanitizer { Module &M; const StackSafetyGlobalInfo *SSI; Triple TargetTriple; - FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset; - FunctionCallee HWAsanHandleVfork; /// This struct defines the shadow mapping using the rule: /// shadow = (mem >> Scale) + Offset. @@ -392,6 +390,9 @@ class HWAddressSanitizer { FunctionCallee HwasanMemoryAccessCallback[2][kNumberOfAccessSizes]; FunctionCallee HwasanMemoryAccessCallbackSized[2]; + FunctionCallee HwasanMemmove, HwasanMemcpy, HwasanMemset; + FunctionCallee HwasanHandleVfork; + FunctionCallee HwasanTagMemoryFunc; FunctionCallee HwasanGenerateTagFunc; FunctionCallee HwasanRecordFrameRecordFunc; @@ -625,25 +626,25 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); const std::string MatchAllStr = UseMatchAllCallback ? "_match_all" : ""; FunctionType *HwasanMemoryAccessCallbackSizedFnTy, - *HwasanMemoryAccessCallbackFnTy, *HWAsanMemTransferFnTy, - *HWAsanMemsetFnTy; + *HwasanMemoryAccessCallbackFnTy, *HwasanMemTransferFnTy, + *HwasanMemsetFnTy; if (UseMatchAllCallback) { HwasanMemoryAccessCallbackSizedFnTy = FunctionType::get(VoidTy, {IntptrTy, IntptrTy, Int8Ty}, false); HwasanMemoryAccessCallbackFnTy = FunctionType::get(VoidTy, {IntptrTy, Int8Ty}, false); - HWAsanMemTransferFnTy = FunctionType::get( + HwasanMemTransferFnTy = FunctionType::get( Int8PtrTy, {Int8PtrTy, Int8PtrTy, IntptrTy, Int8Ty}, false); - HWAsanMemsetFnTy = FunctionType::get( + HwasanMemsetFnTy = FunctionType::get( Int8PtrTy, {Int8PtrTy, Int32Ty, IntptrTy, Int8Ty}, false); } else { HwasanMemoryAccessCallbackSizedFnTy = FunctionType::get(VoidTy, {IntptrTy, IntptrTy}, false); HwasanMemoryAccessCallbackFnTy = FunctionType::get(VoidTy, {IntptrTy}, false); - HWAsanMemTransferFnTy = + HwasanMemTransferFnTy = FunctionType::get(Int8PtrTy, {Int8PtrTy, Int8PtrTy, IntptrTy}, false); - HWAsanMemsetFnTy = + HwasanMemsetFnTy = FunctionType::get(Int8PtrTy, {Int8PtrTy, Int32Ty, IntptrTy}, false); } @@ -670,12 +671,12 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { ? 
std::string("") : ClMemoryAccessCallbackPrefix; - HWAsanMemmove = M.getOrInsertFunction( - MemIntrinCallbackPrefix + "memmove" + MatchAllStr, HWAsanMemTransferFnTy); - HWAsanMemcpy = M.getOrInsertFunction( - MemIntrinCallbackPrefix + "memcpy" + MatchAllStr, HWAsanMemTransferFnTy); - HWAsanMemset = M.getOrInsertFunction( - MemIntrinCallbackPrefix + "memset" + MatchAllStr, HWAsanMemsetFnTy); + HwasanMemmove = M.getOrInsertFunction( + MemIntrinCallbackPrefix + "memmove" + MatchAllStr, HwasanMemTransferFnTy); + HwasanMemcpy = M.getOrInsertFunction( + MemIntrinCallbackPrefix + "memcpy" + MatchAllStr, HwasanMemTransferFnTy); + HwasanMemset = M.getOrInsertFunction( + MemIntrinCallbackPrefix + "memset" + MatchAllStr, HwasanMemsetFnTy); HwasanTagMemoryFunc = M.getOrInsertFunction("__hwasan_tag_memory", VoidTy, Int8PtrTy, Int8Ty, IntptrTy); @@ -688,7 +689,7 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow", ArrayType::get(Int8Ty, 0)); - HWAsanHandleVfork = + HwasanHandleVfork = M.getOrInsertFunction("__hwasan_handle_vfork", VoidTy, IntptrTy); } @@ -959,14 +960,14 @@ void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { if (isa(MI)) { if (UseMatchAllCallback) { IRB.CreateCall( - isa(MI) ? HWAsanMemmove : HWAsanMemcpy, + isa(MI) ? HwasanMemmove : HwasanMemcpy, {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false), ConstantInt::get(Int8Ty, *MatchAllTag)}); } else { IRB.CreateCall( - isa(MI) ? HWAsanMemmove : HWAsanMemcpy, + isa(MI) ? HwasanMemmove : HwasanMemcpy, {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()), IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); @@ -974,14 +975,14 @@ void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { } else if (isa(MI)) { if (UseMatchAllCallback) { IRB.CreateCall( - HWAsanMemset, + HwasanMemset, {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false), ConstantInt::get(Int8Ty, *MatchAllTag)}); } else { IRB.CreateCall( - HWAsanMemset, + HwasanMemset, {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()), IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false), IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)}); @@ -1326,7 +1327,7 @@ bool HWAddressSanitizer::instrumentLandingPads( for (auto *LP : LandingPadVec) { IRBuilder<> IRB(LP->getNextNode()); IRB.CreateCall( - HWAsanHandleVfork, + HwasanHandleVfork, {readRegister(IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp" : "sp")}); } From 5265ff253a87a229a01e88e467860d7f8ae6b2a0 Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Wed, 31 May 2023 14:13:48 +0800 Subject: [PATCH 210/704] [VLIWMachineScheduler] Disable default copy ctor and copy assignment operator for VLIWSchedBoundary class VLIWSchedBoundary manages resources such as dynamically allocated memory, it's generally a good practice to either implement a custom copy constructor or disable the default one. 
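The pattern being applied is the usual rule-of-three guard for resource-owning types; a small self-contained illustration, unrelated to the scheduler code itself:

#include <cstddef>

// A type that owns a heap allocation, as the commit describes for
// VLIWSchedBoundary.
class Boundary {
  int *Slots;

public:
  explicit Boundary(std::size_t N) : Slots(new int[N]) {}
  ~Boundary() { delete[] Slots; }

  // With the compiler-generated copy operations, two Boundary objects would
  // share Slots and both delete it (a double free). Deleting the copy
  // constructor and copy assignment turns any accidental copy into a
  // compile-time error instead.
  Boundary(const Boundary &) = delete;
  Boundary &operator=(const Boundary &) = delete;
};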
Reviewed By: JamesNagurne Differential Revision: https://reviews.llvm.org/D151699 --- llvm/include/llvm/CodeGen/VLIWMachineScheduler.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h b/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h index a39f04f6db6c3..50c88067bd1b5 100644 --- a/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h +++ b/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h @@ -151,6 +151,8 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy { Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name + ".P") {} ~VLIWSchedBoundary(); + VLIWSchedBoundary &operator=(const VLIWSchedBoundary &other) = delete; + VLIWSchedBoundary(const VLIWSchedBoundary &other) = delete; void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) { DAG = dag; From 35a0079238ce9fc36cdc8c6a2895eb5538bf7b4a Mon Sep 17 00:00:00 2001 From: Jianjian GUAN Date: Thu, 25 May 2023 14:24:22 +0800 Subject: [PATCH 211/704] [RISCV] Add Zvfhmin extension for clang. This patch adds the Zvfhmin extension for clang. Reviewed By: craig.topper, michaelmaitland Differential Revision: https://reviews.llvm.org/D150253 --- clang/include/clang/Basic/riscv_vector.td | 16 +++++++++-- .../clang/Support/RISCVVIntrinsicUtils.h | 3 ++- clang/lib/Sema/Sema.cpp | 5 ++-- clang/lib/Sema/SemaRISCVVectorLookup.cpp | 12 +++++++++ .../zvfhmin-error.c | 24 +++++++++++++++++ .../rvv-intrinsics-handcrafted/zvfhmin.c | 27 +++++++++++++++++++ clang/test/Sema/riscv-vector-float16-check.c | 2 +- clang/utils/TableGen/RISCVVEmitter.cpp | 1 + llvm/lib/Support/RISCVISAInfo.cpp | 3 +++ llvm/lib/Target/RISCV/RISCVFeatures.td | 5 ++++ 10 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin-error.c create mode 100644 clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin.c diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index 4d48c38adb578..35b1536f72d3b 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -2215,7 +2215,13 @@ let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { def vfwcvt_rtz_x_f_v : RVVConvToWidenSignedBuiltin<"vfwcvt_rtz_x">; def vfwcvt_f_xu_v : RVVConvBuiltin<"Fw", "FwUv", "csi", "vfwcvt_f">; def vfwcvt_f_x_v : RVVConvBuiltin<"Fw", "Fwv", "csi", "vfwcvt_f">; - def vfwcvt_f_f_v : RVVConvBuiltin<"w", "wv", "xf", "vfwcvt_f">; + def vfwcvt_f_f_v : RVVConvBuiltin<"w", "wv", "f", "vfwcvt_f">; + let RequiredFeatures = ["ZvfhminOrZvfh"] in + def vfwcvt_f_f_v_fp16 : RVVConvBuiltin<"w", "wv", "x", "vfwcvt_f"> { + let Name = "vfwcvt_f_f_v"; + let IRName = "vfwcvt_f_f_v"; + let MaskedIRName = "vfwcvt_f_f_v_mask"; + } } // 14.19. 
Narrowing Floating-Point/Integer Type-Convert Instructions @@ -2226,7 +2232,13 @@ let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { def vfncvt_rtz_x_f_w : RVVConvToNarrowingSignedBuiltin<"vfncvt_rtz_x">; def vfncvt_f_xu_w : RVVConvBuiltin<"Fv", "FvUw", "csi", "vfncvt_f">; def vfncvt_f_x_w : RVVConvBuiltin<"Fv", "Fvw", "csi", "vfncvt_f">; - def vfncvt_f_f_w : RVVConvBuiltin<"v", "vw", "xf", "vfncvt_f">; + def vfncvt_f_f_w : RVVConvBuiltin<"v", "vw", "f", "vfncvt_f">; + let RequiredFeatures = ["ZvfhminOrZvfh"] in + def vfncvt_f_f_w_fp16 : RVVConvBuiltin<"v", "vw", "x", "vfncvt_f"> { + let Name = "vfncvt_f_f_w"; + let IRName = "vfncvt_f_f_w"; + let MaskedIRName = "vfncvt_f_f_w_mask"; + } def vfncvt_rod_f_f_w : RVVConvBuiltin<"v", "vw", "xf", "vfncvt_rod_f">; } } diff --git a/clang/include/clang/Support/RISCVVIntrinsicUtils.h b/clang/include/clang/Support/RISCVVIntrinsicUtils.h index 2a81e7972358e..7f2b5d9c28c48 100644 --- a/clang/include/clang/Support/RISCVVIntrinsicUtils.h +++ b/clang/include/clang/Support/RISCVVIntrinsicUtils.h @@ -470,7 +470,8 @@ enum RVVRequire : uint8_t { RVV_REQ_None = 0, RVV_REQ_RV64 = 1 << 0, RVV_REQ_FullMultiply = 1 << 1, - RVV_REQ_Xsfvcp = 1 << 2, + RVV_REQ_ZvfhminOrZvfh = 1 << 2, + RVV_REQ_Xsfvcp = 1 << 3, LLVM_MARK_AS_BITMASK_ENUM(RVV_REQ_Xsfvcp) }; diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 9c6db547dbefd..7cc0d472fca02 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -2044,9 +2044,10 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) { !TI.hasFeature("zve64x")) Diag(Loc, diag::err_riscv_type_requires_extension, FD) << Ty << "zve64x"; if (Ty->isRVVType(/* Bitwidth */ 16, /* IsFloat */ true) && - !TI.hasFeature("experimental-zvfh")) + !TI.hasFeature("experimental-zvfh") && + !TI.hasFeature("experimental-zvfhmin")) Diag(Loc, diag::err_riscv_type_requires_extension, FD) - << Ty << "zvfh"; + << Ty << "zvfh or zvfhmin"; if (Ty->isRVVType(/* Bitwidth */ 32, /* IsFloat */ true) && !TI.hasFeature("zve32f")) Diag(Loc, diag::err_riscv_type_requires_extension, FD) << Ty << "zve32f"; diff --git a/clang/lib/Sema/SemaRISCVVectorLookup.cpp b/clang/lib/Sema/SemaRISCVVectorLookup.cpp index 5599b9277cd79..be955a139c641 100644 --- a/clang/lib/Sema/SemaRISCVVectorLookup.cpp +++ b/clang/lib/Sema/SemaRISCVVectorLookup.cpp @@ -195,6 +195,8 @@ void RISCVIntrinsicManagerImpl::InitIntrinsicList() { const TargetInfo &TI = Context.getTargetInfo(); bool HasRV64 = TI.hasFeature("64bit"); bool HasFullMultiply = TI.hasFeature("v"); + bool HasZvfh = TI.hasFeature("experimental-zvfh"); + bool HasZvfhminOrZvfh = TI.hasFeature("experimental-zvfhmin") || HasZvfh; auto ConstructRVVIntrinsics = [&](ArrayRef Recs, IntrinsicKind K) { @@ -257,6 +259,16 @@ void RISCVIntrinsicManagerImpl::InitIntrinsicList() { !HasFullMultiply) continue; + if (BaseType == BasicType::Float16) { + if ((Record.RequiredExtensions & RVV_REQ_ZvfhminOrZvfh) == + RVV_REQ_ZvfhminOrZvfh) { + if (!HasZvfhminOrZvfh) + continue; + } else if (!HasZvfh) { + continue; + } + } + // Expanded with different LMUL. 
for (int Log2LMUL = -3; Log2LMUL <= 3; Log2LMUL++) { if (!(Record.Log2LMULMask & (1 << (Log2LMUL + 3)))) diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin-error.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin-error.c new file mode 100644 index 0000000000000..bd96a3d66457f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin-error.c @@ -0,0 +1,24 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfh -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-ZVF %s + +// RUN: not %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfhmin -emit-llvm-only %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-ZVFHMIN-ERR + +#include + +// CHECK-ZVF-LABEL: @test_vfadd_vv_f16m1( +// CHECK-ZVF-NEXT: entry: +// CHECK-ZVF-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfadd.nxv4f16.nxv4f16.i64( poison, [[OP1:%.*]], [[OP2:%.*]], i64 [[VL:%.*]]) +// CHECK-ZVF-NEXT: ret [[TMP0]] +// + +// CHECK-ZVFHMIN-ERR: no matching function for call to '__riscv_vfadd' + +vfloat16m1_t test_vfadd_vv_f16m1(vfloat16m1_t op1, vfloat16m1_t op2, size_t vl) { + return __riscv_vfadd(op1, op2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin.c new file mode 100644 index 0000000000000..c000d859a5639 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/zvfhmin.c @@ -0,0 +1,27 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +experimental-zvfhmin -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-ZVFHMIN %s + +#include + +// CHECK-ZVFHMIN-LABEL: @test_vfncvt_f_f_w_f16m1( +// CHECK-ZVFHMIN-NEXT: entry: +// CHECK-ZVFHMIN-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfncvt.f.f.w.nxv4f16.nxv4f32.i64( poison, [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-ZVFHMIN-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_vfncvt_f_f_w_f16m1(vfloat32m2_t src, size_t vl) { + return __riscv_vfncvt_f(src, vl); +} + + +// CHECK-ZVFHMIN-LABEL: @test_vfwcvt_f_f_v_f16m1( +// CHECK-ZVFHMIN-NEXT: entry: +// CHECK-ZVFHMIN-NEXT: [[TMP0:%.*]] = call @llvm.riscv.vfwcvt.f.f.v.nxv4f32.nxv4f16.i64( poison, [[SRC:%.*]], i64 [[VL:%.*]]) +// CHECK-ZVFHMIN-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_vfwcvt_f_f_v_f16m1(vfloat16m1_t src, size_t vl) { + return __riscv_vfwcvt_f(src, vl); +} diff --git a/clang/test/Sema/riscv-vector-float16-check.c b/clang/test/Sema/riscv-vector-float16-check.c index 48959254d2ad4..57e087ba68ec4 100644 --- a/clang/test/Sema/riscv-vector-float16-check.c +++ b/clang/test/Sema/riscv-vector-float16-check.c @@ -4,5 +4,5 @@ // REQUIRES: riscv-registered-target #include -vfloat16m1_t foo() { /* expected-error {{RISC-V type 'vfloat16m1_t' (aka '__rvv_float16m1_t') requires the 'zvfh' extension}} */ +vfloat16m1_t foo() { /* expected-error {{RISC-V type 'vfloat16m1_t' (aka '__rvv_float16m1_t') requires the 'zvfh or zvfhmin' extension}} */ } /* expected-warning {{non-void function does not return a value}}*/ diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp index 
35b2203cecf34..cc64b5e4d8aa1 100644 --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -633,6 +633,7 @@ void RVVEmitter::createRVVIntrinsics( RVVRequire RequireExt = StringSwitch(RequiredFeature) .Case("RV64", RVV_REQ_RV64) .Case("FullMultiply", RVV_REQ_FullMultiply) + .Case("ZvfhminOrZvfh", RVV_REQ_ZvfhminOrZvfh) .Case("Xsfvcp", RVV_REQ_Xsfvcp) .Default(RVV_REQ_None); assert(RequireExt != RVV_REQ_None && "Unrecognized required feature?"); diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index c5b42840b6881..444cf14d8fcb0 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -152,6 +152,7 @@ static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { {"zvfbfmin", RISCVExtensionVersion{0, 6}}, {"zvfbfwma", RISCVExtensionVersion{0, 6}}, {"zvfh", RISCVExtensionVersion{0, 1}}, + {"zvfhmin", RISCVExtensionVersion{0, 1}}, {"ztso", RISCVExtensionVersion{0, 1}}, // vector crypto @@ -943,6 +944,7 @@ static const char *ImpliedExtsZve64x[] = {"zve32x", "zvl64b"}; static const char *ImpliedExtsZvfbfmin[] = {"zve32f"}; static const char *ImpliedExtsZvfbfwma[] = {"zve32f"}; static const char *ImpliedExtsZvfh[] = {"zve32f", "zfhmin"}; +static const char *ImpliedExtsZvfhmin[] = {"zve32f"}; static const char *ImpliedExtsZvkn[] = {"zvbb", "zvbc", "zvkned", "zvknhb", "zvkt"}; static const char *ImpliedExtsZvkng[] = {"zvkg", "zvkn"}; @@ -1004,6 +1006,7 @@ static constexpr ImpliedExtsEntry ImpliedExts[] = { {{"zvfbfmin"}, {ImpliedExtsZvfbfmin}}, {{"zvfbfwma"}, {ImpliedExtsZvfbfwma}}, {{"zvfh"}, {ImpliedExtsZvfh}}, + {{"zvfhmin"}, {ImpliedExtsZvfhmin}}, {{"zvkn"}, {ImpliedExtsZvkn}}, {{"zvkng"}, {ImpliedExtsZvkng}}, {{"zvknhb"}, {ImpliedExtsZvknhb}}, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index d5213a07ea1ca..c7ce850529af0 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -472,6 +472,11 @@ def FeatureStdExtZvfh "'Zvfh' (Vector Half-Precision Floating-Point)", [FeatureStdExtZve32f, FeatureStdExtZfhmin]>; +def FeatureStdExtZvfhmin + : SubtargetFeature<"experimental-zvfhmin", "HasStdExtZvfhmin", "true", + "'Zvfhmin' (Vector Half-Precision Floating-Point Minimal)", + [FeatureStdExtZve32f]>; + def HasVInstructionsF16 : Predicate<"Subtarget->hasVInstructionsF16()">; def HasStdExtZfhOrZvfh From 8e9baa9668e0d8da524ed1c98c2f5f5b0185cf18 Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Wed, 31 May 2023 06:48:49 +0000 Subject: [PATCH 212/704] [mlir][llvm] Add alias and access group info to call. The revision adds the alias analysis and access group interfaces to the call operation. 
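Not part of the patch itself, but for orientation: once translated, the new attributes end up as ordinary instruction metadata on the generated call. The stand-alone C++ sketch below approximates that end result with the plain LLVM API (Instruction::setMetadata plus MDBuilder's alias-scope helpers); the function names "callee"/"caller" and the scope strings are invented for the example.

// Minimal sketch: build a module with one call and attach !alias.scope /
// !noalias metadata to it -- roughly the IR-level effect of the new
// llvm.call attributes after translation.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("demo", Ctx);
  llvm::IRBuilder<> B(Ctx);

  auto *FnTy = llvm::FunctionType::get(B.getVoidTy(), {B.getPtrTy()}, false);
  auto *Callee = llvm::Function::Create(FnTy, llvm::Function::ExternalLinkage,
                                        "callee", M);
  auto *Caller = llvm::Function::Create(FnTy, llvm::Function::ExternalLinkage,
                                        "caller", M);
  B.SetInsertPoint(llvm::BasicBlock::Create(Ctx, "entry", Caller));

  // One anonymous alias scope in one domain, wrapped in a scope list.
  llvm::MDBuilder MDB(Ctx);
  llvm::MDNode *Domain = MDB.createAnonymousAliasScopeDomain("the domain");
  llvm::MDNode *Scope = MDB.createAnonymousAliasScope(Domain, "the scope");
  llvm::MDNode *ScopeList = llvm::MDNode::get(Ctx, {Scope});

  // A call inside the scope, and a second call known not to alias it.
  llvm::CallInst *InScope = B.CreateCall(Callee, {Caller->getArg(0)});
  InScope->setMetadata(llvm::LLVMContext::MD_alias_scope, ScopeList);
  llvm::CallInst *NoAlias = B.CreateCall(Callee, {Caller->getArg(0)});
  NoAlias->setMetadata(llvm::LLVMContext::MD_noalias, ScopeList);
  B.CreateRetVoid();

  M.print(llvm::outs(), nullptr); // prints !alias.scope / !noalias on the calls
}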
Reviewed By: Dinistro Differential Revision: https://reviews.llvm.org/D151693 --- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 32 +++++++++---------- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 15 ++++++--- .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 3 ++ .../LLVMIR/Import/metadata-alias-scopes.ll | 5 +++ .../Target/LLVMIR/Import/metadata-loop.ll | 3 ++ .../Target/LLVMIR/Import/metadata-tbaa.ll | 3 ++ mlir/test/Target/LLVMIR/llvmir.mlir | 6 ++++ mlir/test/Target/LLVMIR/loop-metadata.mlir | 4 +++ mlir/test/Target/LLVMIR/tbaa.mlir | 3 ++ 9 files changed, 53 insertions(+), 21 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 3218701cd5c3e..53418cfec0735 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -172,7 +172,7 @@ def LLVM_FNegOp : LLVM_UnaryFloatArithmeticOp< LLVM_ScalarOrVectorOf, "fneg", "FNeg">; // Memory-related operations. -def LLVM_AllocaOp : LLVM_Op<"alloca", +def LLVM_AllocaOp : LLVM_Op<"alloca", [DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>, LLVM_MemOpPatterns { @@ -328,7 +328,7 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load", DefaultValuedAttr< AtomicOrdering, "AtomicOrdering::not_atomic">:$ordering, OptionalAttr:$syncscope); - // Append the aliasing related attributes define in LLVM_MemAccessOpBase. + // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. let arguments = !con(args, aliasAttrs); let results = (outs LLVM_LoadableType:$res); string llvmInstName = "Load"; @@ -402,7 +402,7 @@ def LLVM_StoreOp : LLVM_MemAccessOpBase<"store", DefaultValuedAttr< AtomicOrdering, "AtomicOrdering::not_atomic">:$ordering, OptionalAttr:$syncscope); - // Append the aliasing related attributes define in LLVM_MemAccessOpBase. + // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. let arguments = !con(args, aliasAttrs); string llvmInstName = "Store"; let description = [{ @@ -568,10 +568,10 @@ def LLVM_LandingpadOp : LLVM_Op<"landingpad"> { // FIXME: Add a type attribute that carries the LLVM function type to support // indirect calls to variadic functions. The type attribute is necessary to // distinguish normal and variadic arguments. -def LLVM_CallOp : LLVM_Op<"call", - [DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods]> { +def LLVM_CallOp : LLVM_MemAccessOpBase<"call", + [DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods]> { let summary = "Call to an LLVM function."; let description = [{ In LLVM IR, functions may return either 0 or 1 value. LLVM IR dialect @@ -601,13 +601,14 @@ def LLVM_CallOp : LLVM_Op<"call", ``` }]; - let arguments = (ins OptionalAttr:$callee, - Variadic, - DefaultValuedAttr:$fastmathFlags, - OptionalAttr:$branch_weights); + dag args = (ins OptionalAttr:$callee, + Variadic, + DefaultValuedAttr:$fastmathFlags, + OptionalAttr:$branch_weights); + // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. 
+ let arguments = !con(args, aliasAttrs); let results = (outs Optional:$result); - let builders = [ OpBuilder<(ins "LLVMFuncOp":$func, "ValueRange":$args)>, OpBuilder<(ins "TypeRange":$results, "StringAttr":$callee, @@ -617,7 +618,6 @@ def LLVM_CallOp : LLVM_Op<"call", OpBuilder<(ins "TypeRange":$results, "StringRef":$callee, CArg<"ValueRange", "{}">:$args)> ]; - let hasCustomAssemblyFormat = 1; } @@ -1778,7 +1778,7 @@ def LLVM_AtomicRMWOp : LLVM_MemAccessOpBase<"atomicrmw", [ OptionalAttr:$syncscope, OptionalAttr:$alignment, UnitAttr:$volatile_); - // Append the aliasing related attributes define in LLVM_MemAccessOpBase. + // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. let arguments = !con(args, aliasAttrs); let results = (outs LLVM_AtomicRMWType:$res); let assemblyFormat = [{ @@ -1832,7 +1832,7 @@ def LLVM_AtomicCmpXchgOp : LLVM_MemAccessOpBase<"cmpxchg", [ OptionalAttr:$alignment, UnitAttr:$weak, UnitAttr:$volatile_); - // Append the aliasing related attributes define in LLVM_MemAccessOpBase. + // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. let arguments = !con(args, aliasAttrs); let results = (outs LLVM_AnyStruct:$res); let assemblyFormat = [{ diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index a31daaab17e1d..c44c60a0bc176 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -916,13 +916,15 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, StringAttr callee, ValueRange args) { - build(builder, state, results, SymbolRefAttr::get(callee), args, nullptr, - nullptr); + build(builder, state, results, SymbolRefAttr::get(callee), args); } void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, FlatSymbolRefAttr callee, ValueRange args) { - build(builder, state, results, callee, args, nullptr, nullptr); + build(builder, state, results, callee, args, /*fastmathFlags=*/nullptr, + /*branch_weights=*/nullptr, + /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, + /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, @@ -931,8 +933,11 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, Type resultType = func.getFunctionType().getReturnType(); if (!llvm::isa(resultType)) results.push_back(resultType); - build(builder, state, results, SymbolRefAttr::get(func), args, nullptr, - nullptr); + build(builder, state, results, SymbolRefAttr::get(func), args, + /*fastmathFlags=*/nullptr, + /*branch_weights=*/nullptr, + /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, + /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } CallInterfaceCallable CallOp::getCallableForCallee() { diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index c5a48005cd400..a044930a0cf8b 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -186,6 +186,9 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, convertBranchWeights(callOp.getBranchWeights(), moduleTranslation); if (branchWeights) call->setMetadata(llvm::LLVMContext::MD_prof, branchWeights); + moduleTranslation.setAccessGroupsMetadata(callOp, call); + 
moduleTranslation.setAliasScopeMetadata(callOp, call); + moduleTranslation.setTBAAMetadata(callOp, call); // If the called function has a result, remap the corresponding value. Note // that LLVM IR dialect CallOp has either 0 or 1 result. if (opInst.getNumResults() != 0) { diff --git a/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll b/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll index eb74b0ab880bb..19abc95a4a268 100644 --- a/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll +++ b/mlir/test/Target/LLVMIR/Import/metadata-alias-scopes.ll @@ -83,12 +83,17 @@ define void @supported_ops(ptr %arg1, float %arg2, i32 %arg3, i32 %arg4) { call void @llvm.memcpy.p0.p0.i32(ptr %arg1, ptr %arg1, i32 4, i1 false), !alias.scope !2 ; CHECK: "llvm.intr.memset"{{.*}}alias_scopes = [@__llvm_global_metadata::@[[$SCOPE]]] call void @llvm.memset.p0.i32(ptr %arg1, i8 42, i32 4, i1 false), !alias.scope !2 + ; CHECK: llvm.call{{.*}}alias_scopes = [@__llvm_global_metadata::@[[$SCOPE]]] + call void @foo(ptr %arg1), !alias.scope !2 + ; CHECK: llvm.call{{.*}}noalias_scopes = [@__llvm_global_metadata::@[[$SCOPE]]] + call void @foo(ptr %arg1), !noalias !2 ret void } declare void @llvm.experimental.noalias.scope.decl(metadata) declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) +declare void @foo(ptr %arg1) !0 = distinct !{!0, !"The domain"} !1 = !{!1, !0} diff --git a/mlir/test/Target/LLVMIR/Import/metadata-loop.ll b/mlir/test/Target/LLVMIR/Import/metadata-loop.ll index dcd2004040abb..9638ebd0dc197 100644 --- a/mlir/test/Target/LLVMIR/Import/metadata-loop.ll +++ b/mlir/test/Target/LLVMIR/Import/metadata-loop.ll @@ -41,11 +41,14 @@ define void @supported_ops(ptr %arg1, float %arg2, i32 %arg3, i32 %arg4) { call void @llvm.memcpy.p0.p0.i32(ptr %arg1, ptr %arg1, i32 4, i1 false), !llvm.access.group !0 ; CHECK: "llvm.intr.memset"{{.*}}access_groups = call void @llvm.memset.p0.i32(ptr %arg1, i8 42, i32 4, i1 false), !llvm.access.group !0 + ; CHECK: llvm.call{{.*}}access_groups = + call void @foo(ptr %arg1), !llvm.access.group !0 ret void } declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) +declare void @foo(ptr %arg1) !0 = !{!1, !2} !1 = distinct !{} diff --git a/mlir/test/Target/LLVMIR/Import/metadata-tbaa.ll b/mlir/test/Target/LLVMIR/Import/metadata-tbaa.ll index 2aa1e94a0e2a4..9477063c85798 100644 --- a/mlir/test/Target/LLVMIR/Import/metadata-tbaa.ll +++ b/mlir/test/Target/LLVMIR/Import/metadata-tbaa.ll @@ -85,11 +85,14 @@ define void @supported_ops(ptr %arg1, float %arg2, i32 %arg3, i32 %arg4) { call void @llvm.memcpy.p0.p0.i32(ptr %arg1, ptr %arg1, i32 4, i1 false), !tbaa !0 ; CHECK: "llvm.intr.memset"{{.*}}tbaa = call void @llvm.memset.p0.i32(ptr %arg1, i8 42, i32 4, i1 false), !tbaa !0 + ; CHECK: llvm.call{{.*}}tbaa = + call void @foo(ptr %arg1), !tbaa !0 ret void } declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg) +declare void @foo(ptr %arg1) !0 = !{!1, !1, i64 0} !1 = !{!"scalar type", !2, i64 0} diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 1c1581c6c6705..ed4237fc8c1d7 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir 
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -2019,6 +2019,8 @@ llvm.func @switch_weights(%arg0: i32) -> i32 { // ----- +llvm.func @foo(%arg0: !llvm.ptr) + // CHECK-LABEL: aliasScope llvm.func @aliasScope(%arg1 : !llvm.ptr) { %0 = llvm.mlir.constant(0 : i32) : i32 @@ -2038,6 +2040,10 @@ llvm.func @aliasScope(%arg1 : !llvm.ptr) { "llvm.intr.memcpy"(%arg1, %arg1, %0, %4) {alias_scopes = [@metadata::@scope3]} : (!llvm.ptr, !llvm.ptr, i32, i1) -> () // CHECK: llvm.memset{{.*}}, !noalias ![[SCOPES3]] "llvm.intr.memset"(%arg1, %5, %0, %4) {noalias_scopes = [@metadata::@scope3]} : (!llvm.ptr, i8, i32, i1) -> () + // CHECK: call void @foo({{.*}} !alias.scope ![[SCOPES3]] + llvm.call @foo(%arg1) {alias_scopes = [@metadata::@scope3]} : (!llvm.ptr) -> () + // CHECK: call void @foo({{.*}} !noalias ![[SCOPES3]] + llvm.call @foo(%arg1) {noalias_scopes = [@metadata::@scope3]} : (!llvm.ptr) -> () llvm.return } diff --git a/mlir/test/Target/LLVMIR/loop-metadata.mlir b/mlir/test/Target/LLVMIR/loop-metadata.mlir index f17cc003aa0ae..de480726190b7 100644 --- a/mlir/test/Target/LLVMIR/loop-metadata.mlir +++ b/mlir/test/Target/LLVMIR/loop-metadata.mlir @@ -233,6 +233,8 @@ llvm.func @unswitchOptions() { // ----- +llvm.func @foo(%arg0: i32) + // CHECK-LABEL: @loopOptions llvm.func @loopOptions(%arg1 : i32, %arg2 : i32) { %0 = llvm.mlir.constant(0 : i32) : i32 @@ -262,6 +264,8 @@ llvm.func @loopOptions(%arg1 : i32, %arg2 : i32) { "llvm.intr.memcpy"(%4, %4, %0, %8) {access_groups = [@metadata::@group1, @metadata::@group2]} : (!llvm.ptr, !llvm.ptr, i32, i1) -> () // CHECK: llvm.memset{{.*}} !llvm.access.group ![[ACCESS_GROUPS_NODE]] "llvm.intr.memset"(%4, %9, %0, %8) {access_groups = [@metadata::@group1, @metadata::@group2]} : (!llvm.ptr, i8, i32, i1) -> () + // CHECK: call void @foo({{.*}} !llvm.access.group ![[ACCESS_GROUPS_NODE]] + llvm.call @foo(%arg1) {access_groups = [@metadata::@group1, @metadata::@group2]} : (i32) -> () // CHECK: br label {{.*}} !llvm.loop ![[LOOP_NODE]] llvm.br ^bb3(%3 : i32) {loop_annotation = #llvm.loop_annotation< licm = , diff --git a/mlir/test/Target/LLVMIR/tbaa.mlir b/mlir/test/Target/LLVMIR/tbaa.mlir index 1c9b2be86851f..ade6c4acb0df7 100644 --- a/mlir/test/Target/LLVMIR/tbaa.mlir +++ b/mlir/test/Target/LLVMIR/tbaa.mlir @@ -55,6 +55,7 @@ module { llvm.tbaa_type_desc @tbaa_type_desc_6 {id = "agg1_t", members = {<@tbaa_type_desc_5, 0>, <@tbaa_type_desc_5, 4>}} llvm.tbaa_tag @tbaa_tag_7 {access_type = @tbaa_type_desc_5, base_type = @tbaa_type_desc_6, offset = 0 : i64} } + llvm.func @foo(%arg0: !llvm.ptr) llvm.func @tbaa2(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { %0 = llvm.mlir.constant(0 : i32) : i32 %1 = llvm.mlir.constant(1 : i32) : i32 @@ -75,6 +76,8 @@ module { "llvm.intr.memcpy"(%arg1, %arg1, %0, %8) {tbaa = [@__tbaa::@tbaa_tag_7]} : (!llvm.ptr, !llvm.ptr, i32, i1) -> () // CHECK: llvm.memset{{.*}} !tbaa ![[STAG]] "llvm.intr.memset"(%arg1, %9, %0, %8) {tbaa = [@__tbaa::@tbaa_tag_7]} : (!llvm.ptr, i8, i32, i1) -> () + // CHECK: call void @foo({{.*}} !tbaa ![[STAG]] + llvm.call @foo(%arg1) {tbaa = [@__tbaa::@tbaa_tag_7]} : (!llvm.ptr) -> () llvm.return } } From 39f4bd214f1be248283fb7e35bc2610c19169252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 31 May 2023 08:50:38 +0200 Subject: [PATCH 213/704] [clang][Interp][NFC] Simplify dump() indentation Use llvm::raw_ostream::indent(). 
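As a stand-alone illustration (not part of the change): raw_ostream::indent(N) simply writes N spaces, so the hand-written space loop in Function::dump() collapses to a single call, as the diff below shows. The 30-column field width and the guard for over-long names in this sketch mirror the follow-up fix that appears later in the series; the opcode names are made up.

#include "llvm/Support/raw_ostream.h"
#include <cstring>

// Print Name left-aligned in a 30-column field, then the rest of the line.
static void printPadded(const char *Name) {
  llvm::raw_ostream &OS = llvm::outs();
  OS << Name;
  long N = 30 - static_cast<long>(std::strlen(Name));
  if (N > 0)
    OS.indent(static_cast<unsigned>(N)); // emits N spaces
  OS << "| operands...\n";
}

int main() {
  printPadded("GetPtrLocal");                                  // padded to col 30
  printPadded("AnOpcodeNameLongerThanTheThirtyColumnFieldIs"); // no padding
}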
--- clang/lib/AST/Interp/Disasm.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index 7a5da90cd9002..f4a6cb85470fb 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -42,9 +42,7 @@ LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const { auto PrintName = [&OS](const char *Name) { OS << Name; - for (long I = 0, N = strlen(Name); I < 30 - N; ++I) { - OS << ' '; - } + OS.indent(std::max(30l - strlen(Name), 0ul)); }; for (CodePtr Start = getCodeBegin(), PC = Start; PC != getCodeEnd();) { From ddff70cb4b0017489d98bf301a5bcc4f9fa4afab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 31 May 2023 08:56:19 +0200 Subject: [PATCH 214/704] [clang][Interp][NFC] Make InterpFrame::describe() more const-correct --- clang/lib/AST/Interp/InterpFrame.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp index 6acfbd3fa6143..e20f283c28558 100644 --- a/clang/lib/AST/Interp/InterpFrame.cpp +++ b/clang/lib/AST/Interp/InterpFrame.cpp @@ -98,20 +98,19 @@ void print(llvm::raw_ostream &OS, const Pointer &P, ASTContext &Ctx, return; } - auto printDesc = [&OS, &Ctx](Descriptor *Desc) { - if (auto *D = Desc->asDecl()) { + auto printDesc = [&OS, &Ctx](const Descriptor *Desc) { + if (const auto *D = Desc->asDecl()) { // Subfields or named values. - if (auto *VD = dyn_cast(D)) { + if (const auto *VD = dyn_cast(D)) { OS << *VD; return; } // Base classes. - if (isa(D)) { + if (isa(D)) return; - } } // Temporary expression. - if (auto *E = Desc->asExpr()) { + if (const auto *E = Desc->asExpr()) { E->printPretty(OS, nullptr, Ctx.getPrintingPolicy()); return; } From a0ea9f63c25b1cd4cb7747ea611596bb5e2db8a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 31 May 2023 09:13:19 +0200 Subject: [PATCH 215/704] [clang] Fix 39f4bd214f1be248283fb7e35bc2610c19169252 on win builders --- clang/lib/AST/Interp/Disasm.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index f4a6cb85470fb..35ed5d1286971 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -42,7 +42,9 @@ LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const { auto PrintName = [&OS](const char *Name) { OS << Name; - OS.indent(std::max(30l - strlen(Name), 0ul)); + long N = 30 - strlen(Name); + if (N > 0) + OS.indent(N); }; for (CodePtr Start = getCodeBegin(), PC = Start; PC != getCodeEnd();) { From 238f15820e71080b0bbc7d6ee95303fdb430d6fa Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 31 May 2023 10:16:51 +0300 Subject: [PATCH 216/704] [clang] Add test for CWG873 Also add missing marking to the test of related issue 621. 
https://cplusplus.github.io/CWG/issues/621.html https://cplusplus.github.io/CWG/issues/873.html Reviewed By: #clang-language-wg, shafik Differential Revision: https://reviews.llvm.org/D151704 --- clang/test/CXX/drs/dr6xx.cpp | 2 +- clang/test/CXX/drs/dr8xx.cpp | 30 ++++++++++++++++++++++++++++++ clang/www/cxx_dr_status.html | 4 ++-- 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 clang/test/CXX/drs/dr8xx.cpp diff --git a/clang/test/CXX/drs/dr6xx.cpp b/clang/test/CXX/drs/dr6xx.cpp index 59331e7de7687..a08f55dd562a4 100644 --- a/clang/test/CXX/drs/dr6xx.cpp +++ b/clang/test/CXX/drs/dr6xx.cpp @@ -234,7 +234,7 @@ namespace dr619 { // dr619: yes // dr620: dup 568 -namespace dr621 { +namespace dr621 { // dr621: yes template T f(); template<> int f() {} // expected-note {{previous}} template<> int f() {} // expected-error {{redefinition}} diff --git a/clang/test/CXX/drs/dr8xx.cpp b/clang/test/CXX/drs/dr8xx.cpp new file mode 100644 index 0000000000000..d79430f9e03ca --- /dev/null +++ b/clang/test/CXX/drs/dr8xx.cpp @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s +// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s +// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s +// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s + +// expected-no-diagnostics + +namespace dr873 { // dr873: yes +#if __cplusplus >= 201103L +template void f(T &&); +template <> void f(int &) {} // #1 +template <> void f(int &&) {} // #2 +void g(int i) { + f(i); // calls f(int&), i.e., #1 +#pragma clang __debug dump f(i) + // CHECK: CallExpr {{.*}} + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'f' 'void (int &)' {{.*}} + + f(0); // calls f(int&&), i.e., #2 +#pragma clang __debug dump f(0) + // CHECK: CallExpr {{.*}} + // CHECK-NEXT: |-ImplicitCastExpr {{.*}} + // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'f' 'void (int &&)' {{.*}} +} +#endif +} // namespace dr873 diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index ec2ac24450832..77790bceea851 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -3767,7 +3767,7 @@

 C++ defect report implementation status
    621
    C++11
    Template argument deduction from function return types
-   Unknown
+   Yes
    622
@@ -5117,7 +5117,7 @@
 C++ defect report implementation status
873 C++11 Deducing rvalue references in declarative contexts - Unknown + Yes 874 From 548fa1d3086f5fe6e6e1bf52bb661e00a954503e Mon Sep 17 00:00:00 2001 From: LiaoChunyu Date: Wed, 31 May 2023 15:04:39 +0800 Subject: [PATCH 217/704] [RISCV] Add special case for (select cc, 1.0, 0.0) to lowerSELECT Use sint_to_fp instead of select. Reduce the number of branch instructions and avoid generating TargetConstantPool for double. (select cc, 1.0, 0.0) -> (sint_to_fp (zext cc)) https://alive2.llvm.org/ce/z/aoEcd9 https://godbolt.org/z/n543Y9v3e (select cc, 0.0, 1.0) -> (sint_to_fp (zext (xor cc, 1))) https://alive2.llvm.org/ce/z/zngvSB Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D151719 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 14 ++++ llvm/test/CodeGen/RISCV/double-select-icmp.ll | 64 ++++++++++++++++++ llvm/test/CodeGen/RISCV/float-select-icmp.ll | 36 ++++++++++ llvm/test/CodeGen/RISCV/half-select-icmp.ll | 67 +++++++++++++++++++ 4 files changed, 181 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c327bc51d771f..305ad58963651 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -5512,6 +5512,20 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget)) return V; + // (select cc, 1.0, 0.0) -> (sint_to_fp (zext cc)) + // (select cc, 0.0, 1.0) -> (sint_to_fp (zext (xor cc, 1))) + const ConstantFPSDNode *FPTV = dyn_cast(TrueV); + const ConstantFPSDNode *FPFV = dyn_cast(FalseV); + if (FPTV && FPFV) { + if (FPTV->isExactlyValue(1.0) && FPFV->isExactlyValue(0.0)) + return DAG.getNode(ISD::SINT_TO_FP, DL, VT, CondV); + if (FPTV->isExactlyValue(0.0) && FPFV->isExactlyValue(1.0)) { + SDValue XOR = DAG.getNode(ISD::XOR, DL, XLenVT, CondV, + DAG.getConstant(1, DL, XLenVT)); + return DAG.getNode(ISD::SINT_TO_FP, DL, VT, XOR); + } + } + // If the condition is not an integer SETCC which operates on XLenVT, we need // to emit a RISCVISD::SELECT_CC comparing the condition to zero. 
i.e.: // (select condv, truev, falsev) diff --git a/llvm/test/CodeGen/RISCV/double-select-icmp.ll b/llvm/test/CodeGen/RISCV/double-select-icmp.ll index 259ba6a2b5006..9fdab5f7b8a59 100644 --- a/llvm/test/CodeGen/RISCV/double-select-icmp.ll +++ b/llvm/test/CodeGen/RISCV/double-select-icmp.ll @@ -448,3 +448,67 @@ define double @select_icmp_sle(i32 signext %a, i32 signext %b, double %c, double %2 = select i1 %1, double %c, double %d ret double %2 } + +define double @select_icmp_slt_one(i32 signext %a) { +; CHECK-LABEL: select_icmp_slt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: slti a0, a0, 1 +; CHECK-NEXT: fcvt.d.w fa0, a0 +; CHECK-NEXT: ret +; +; RV32ZDINX-LABEL: select_icmp_slt_one: +; RV32ZDINX: # %bb.0: +; RV32ZDINX-NEXT: addi sp, sp, -16 +; RV32ZDINX-NEXT: .cfi_def_cfa_offset 16 +; RV32ZDINX-NEXT: slti a0, a0, 1 +; RV32ZDINX-NEXT: fcvt.d.w a0, a0 +; RV32ZDINX-NEXT: sw a0, 8(sp) +; RV32ZDINX-NEXT: sw a1, 12(sp) +; RV32ZDINX-NEXT: lw a0, 8(sp) +; RV32ZDINX-NEXT: lw a1, 12(sp) +; RV32ZDINX-NEXT: addi sp, sp, 16 +; RV32ZDINX-NEXT: ret +; +; RV64ZDINX-LABEL: select_icmp_slt_one: +; RV64ZDINX: # %bb.0: +; RV64ZDINX-NEXT: slti a0, a0, 1 +; RV64ZDINX-NEXT: fcvt.d.w a0, a0 +; RV64ZDINX-NEXT: ret + %1 = icmp slt i32 %a, 1 + %2 = select i1 %1, double 1.000000e+00, double 0.000000e+00 + ret double %2 +} + +define double @select_icmp_sgt_zero(i32 signext %a) { +; CHECK-LABEL: select_icmp_sgt_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: sgtz a0, a0 +; CHECK-NEXT: xori a0, a0, 1 +; CHECK-NEXT: fcvt.d.w fa0, a0 +; CHECK-NEXT: ret +; +; RV32ZDINX-LABEL: select_icmp_sgt_zero: +; RV32ZDINX: # %bb.0: +; RV32ZDINX-NEXT: addi sp, sp, -16 +; RV32ZDINX-NEXT: .cfi_def_cfa_offset 16 +; RV32ZDINX-NEXT: sgtz a0, a0 +; RV32ZDINX-NEXT: xori a0, a0, 1 +; RV32ZDINX-NEXT: fcvt.d.w a0, a0 +; RV32ZDINX-NEXT: sw a0, 8(sp) +; RV32ZDINX-NEXT: sw a1, 12(sp) +; RV32ZDINX-NEXT: lw a0, 8(sp) +; RV32ZDINX-NEXT: lw a1, 12(sp) +; RV32ZDINX-NEXT: addi sp, sp, 16 +; RV32ZDINX-NEXT: ret +; +; RV64ZDINX-LABEL: select_icmp_sgt_zero: +; RV64ZDINX: # %bb.0: +; RV64ZDINX-NEXT: sgtz a0, a0 +; RV64ZDINX-NEXT: xori a0, a0, 1 +; RV64ZDINX-NEXT: fcvt.d.w a0, a0 +; RV64ZDINX-NEXT: ret + %1 = icmp sgt i32 %a, 0 + %2 = select i1 %1, double 0.000000e+00, double 1.000000e+00 + ret double %2 +} + diff --git a/llvm/test/CodeGen/RISCV/float-select-icmp.ll b/llvm/test/CodeGen/RISCV/float-select-icmp.ll index f46f25f777ca9..1d2aeaaf917d4 100644 --- a/llvm/test/CodeGen/RISCV/float-select-icmp.ll +++ b/llvm/test/CodeGen/RISCV/float-select-icmp.ll @@ -227,3 +227,39 @@ define float @select_icmp_sle(i32 signext %a, i32 signext %b, float %c, float %d %2 = select i1 %1, float %c, float %d ret float %2 } + +define float @select_icmp_slt_one(i32 signext %a) { +; CHECK-LABEL: select_icmp_slt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: slti a0, a0, 1 +; CHECK-NEXT: fcvt.s.w fa0, a0 +; CHECK-NEXT: ret +; +; CHECKZFINX-LABEL: select_icmp_slt_one: +; CHECKZFINX: # %bb.0: +; CHECKZFINX-NEXT: slti a0, a0, 1 +; CHECKZFINX-NEXT: fcvt.s.w a0, a0 +; CHECKZFINX-NEXT: ret + %1 = icmp slt i32 %a, 1 + %2 = select i1 %1, float 1.000000e+00, float 0.000000e+00 + ret float %2 +} + +define float @select_icmp_sgt_zero(i32 signext %a) { +; CHECK-LABEL: select_icmp_sgt_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: sgtz a0, a0 +; CHECK-NEXT: xori a0, a0, 1 +; CHECK-NEXT: fcvt.s.w fa0, a0 +; CHECK-NEXT: ret +; +; CHECKZFINX-LABEL: select_icmp_sgt_zero: +; CHECKZFINX: # %bb.0: +; CHECKZFINX-NEXT: sgtz a0, a0 +; CHECKZFINX-NEXT: xori a0, a0, 1 +; CHECKZFINX-NEXT: fcvt.s.w a0, a0 +; CHECKZFINX-NEXT: ret + %1 
= icmp sgt i32 %a, 0 + %2 = select i1 %1, float 0.000000e+00, float 1.000000e+00 + ret float %2 +} diff --git a/llvm/test/CodeGen/RISCV/half-select-icmp.ll b/llvm/test/CodeGen/RISCV/half-select-icmp.ll index 921ca184745bc..465579abe4a80 100644 --- a/llvm/test/CodeGen/RISCV/half-select-icmp.ll +++ b/llvm/test/CodeGen/RISCV/half-select-icmp.ll @@ -475,3 +475,70 @@ define half @select_icmp_sle(i32 signext %a, i32 signext %b, half %c, half %d) { %2 = select i1 %1, half %c, half %d ret half %2 } + +define half @select_icmp_slt_one(i32 signext %a) { +; CHECK-LABEL: select_icmp_slt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: slti a0, a0, 1 +; CHECK-NEXT: fcvt.h.w fa0, a0 +; CHECK-NEXT: ret +; +; CHECKIZHINX-LABEL: select_icmp_slt_one: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: slti a0, a0, 1 +; CHECKIZHINX-NEXT: fcvt.h.w a0, a0 +; CHECKIZHINX-NEXT: ret +; +; CHECKIZFHMIN-LABEL: select_icmp_slt_one: +; CHECKIZFHMIN: # %bb.0: +; CHECKIZFHMIN-NEXT: slti a0, a0, 1 +; CHECKIZFHMIN-NEXT: fcvt.s.w fa5, a0 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: ret +; +; CHECKIZHINXMIN-LABEL: select_icmp_slt_one: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: slti a0, a0, 1 +; CHECKIZHINXMIN-NEXT: fcvt.s.w a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret + %1 = icmp slt i32 %a, 1 + %2 = select i1 %1, half 1.000000e+00, half 0.000000e+00 + ret half %2 +} + +define half @select_icmp_sgt_zero(i32 signext %a) { +; CHECK-LABEL: select_icmp_sgt_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: sgtz a0, a0 +; CHECK-NEXT: xori a0, a0, 1 +; CHECK-NEXT: fcvt.h.w fa0, a0 +; CHECK-NEXT: ret +; +; CHECKIZHINX-LABEL: select_icmp_sgt_zero: +; CHECKIZHINX: # %bb.0: +; CHECKIZHINX-NEXT: sgtz a0, a0 +; CHECKIZHINX-NEXT: xori a0, a0, 1 +; CHECKIZHINX-NEXT: fcvt.h.w a0, a0 +; CHECKIZHINX-NEXT: ret +; +; CHECKIZFHMIN-LABEL: select_icmp_sgt_zero: +; CHECKIZFHMIN: # %bb.0: +; CHECKIZFHMIN-NEXT: sgtz a0, a0 +; CHECKIZFHMIN-NEXT: xori a0, a0, 1 +; CHECKIZFHMIN-NEXT: fcvt.s.w fa5, a0 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: ret +; +; CHECKIZHINXMIN-LABEL: select_icmp_sgt_zero: +; CHECKIZHINXMIN: # %bb.0: +; CHECKIZHINXMIN-NEXT: sgtz a0, a0 +; CHECKIZHINXMIN-NEXT: xori a0, a0, 1 +; CHECKIZHINXMIN-NEXT: fcvt.s.w a0, a0 +; CHECKIZHINXMIN-NEXT: fcvt.h.s a0, a0 +; CHECKIZHINXMIN-NEXT: ret + %1 = icmp sgt i32 %a, 0 + %2 = select i1 %1, half 0.000000e+00, half 1.000000e+00 + ret half %2 +} + From b7e5cb1f9a3a5226f22bb81c865214be81dce940 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 17 May 2023 20:23:06 +0200 Subject: [PATCH 218/704] [clang][NFC] Refactor emitSnippet() Rename parameters and local variables and reorder things a bit to be closer to their first point of use. 
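Not part of the patch: besides the renames, the refactor replaces a named std::pair result plus res.first/res.second with a structured binding. A generic sketch of that idiom follows; nextChunk() is an invented stand-in for printableTextForNextCharacter().

#include <cstddef>
#include <iostream>
#include <string>
#include <utility>

// Return the printable text for the next character and whether it was
// printable as-is (tabs are "expanded" here just for the demo).
static std::pair<std::string, bool> nextChunk(const std::string &Line,
                                              std::size_t &I) {
  char C = Line[I++];
  if (C == '\t')
    return {"    ", false};
  return {std::string(1, C), true};
}

int main() {
  std::string SourceLine = "a\tb";
  for (std::size_t I = 0; I < SourceLine.size();) {
    // Structured binding instead of res.first / res.second.
    auto [Str, WasPrintable] = nextChunk(SourceLine, I);
    if (WasPrintable)
      std::cout << Str;
    else
      std::cout << '[' << Str << ']'; // would be highlighted in the real code
  }
  std::cout << '\n';
}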
Differential Revision: https://reviews.llvm.org/D150840 --- clang/lib/Frontend/TextDiagnostic.cpp | 38 +++++++++++++-------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index baf9b017fc83e..51b901180ee5a 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -1298,16 +1298,12 @@ void TextDiagnostic::emitSnippetAndCaret( emitParseableFixits(Hints, SM); } -void TextDiagnostic::emitSnippet(StringRef line, unsigned MaxLineNoDisplayWidth, +void TextDiagnostic::emitSnippet(StringRef SourceLine, + unsigned MaxLineNoDisplayWidth, unsigned LineNo) { - if (line.empty()) + if (SourceLine.empty()) return; - size_t i = 0; - - std::string to_print; - bool print_reversed = false; - // Emit line number. if (MaxLineNoDisplayWidth > 0) { unsigned LineNoDisplayWidth = getNumDisplayWidth(LineNo); @@ -1318,28 +1314,30 @@ void TextDiagnostic::emitSnippet(StringRef line, unsigned MaxLineNoDisplayWidth, OS << " | "; } - while (i,bool> res - = printableTextForNextCharacter(line, &i, DiagOpts->TabStop); - bool was_printable = res.second; + bool PrintReversed = false; + std::string ToPrint; + size_t I = 0; + while (I < SourceLine.size()) { + auto [Str, WasPrintable] = + printableTextForNextCharacter(SourceLine, &I, DiagOpts->TabStop); - if (DiagOpts->ShowColors && was_printable == print_reversed) { - if (print_reversed) + if (DiagOpts->ShowColors && WasPrintable == PrintReversed) { + if (PrintReversed) OS.reverseColor(); - OS << to_print; - to_print.clear(); + OS << ToPrint; + ToPrint.clear(); if (DiagOpts->ShowColors) OS.resetColor(); } - print_reversed = !was_printable; - to_print += res.first.str(); + PrintReversed = !WasPrintable; + ToPrint += Str; } - if (print_reversed && DiagOpts->ShowColors) + if (PrintReversed && DiagOpts->ShowColors) OS.reverseColor(); - OS << to_print; - if (print_reversed && DiagOpts->ShowColors) + OS << ToPrint; + if (PrintReversed && DiagOpts->ShowColors) OS.resetColor(); OS << '\n'; From 6614d36d711495c57d8971f2330065c7cd21b9ef Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Tue, 23 May 2023 15:06:41 +0300 Subject: [PATCH 219/704] [libc++][spaceship] Additional tests for `operator<=>` `map` and `multimap` - Added additional tests - Improved existing tests - Moved misplaced test files to the correct location Reviewed By: #libc, philnik Differential Revision: https://reviews.llvm.org/D151205 --- .../compare.three_way.pass.cpp | 0 .../compare.three_way.verify.cpp | 61 ++++++++++ .../compare.three_way.pass.cpp | 0 .../compare.three_way.verify.cpp | 61 ++++++++++ .../test/support/test_container_comparisons.h | 109 ++++++++++-------- 5 files changed, 182 insertions(+), 49 deletions(-) rename libcxx/test/std/containers/associative/map/{ => map.nonmember}/compare.three_way.pass.cpp (100%) create mode 100644 libcxx/test/std/containers/associative/map/map.nonmember/compare.three_way.verify.cpp rename libcxx/test/std/containers/associative/multimap/{ => multimap.nonmember}/compare.three_way.pass.cpp (100%) create mode 100644 libcxx/test/std/containers/associative/multimap/multimap.nonmember/compare.three_way.verify.cpp diff --git a/libcxx/test/std/containers/associative/map/compare.three_way.pass.cpp b/libcxx/test/std/containers/associative/map/map.nonmember/compare.three_way.pass.cpp similarity index 100% rename from libcxx/test/std/containers/associative/map/compare.three_way.pass.cpp rename to 
libcxx/test/std/containers/associative/map/map.nonmember/compare.three_way.pass.cpp diff --git a/libcxx/test/std/containers/associative/map/map.nonmember/compare.three_way.verify.cpp b/libcxx/test/std/containers/associative/map/map.nonmember/compare.three_way.verify.cpp new file mode 100644 index 0000000000000..a1cd2ab29bd2a --- /dev/null +++ b/libcxx/test/std/containers/associative/map/map.nonmember/compare.three_way.verify.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// class map + +// template +// synth-three-way-result> +// operator<=>(const map& x, +// const map& y); + +#include + +#include "test_allocator.h" + +int main(int, char**) { + // Mismatching allocators + { + std::map, std::allocator> s1; + std::map, test_allocator> s2; + // expected-error-re@*:* {{{{(static_assert|static assertion)}} failed due to requirement 'is_same>::value'{{.*}}Allocator::value_type must be same type as value_type}} + s1 <=> s2; + // expected-error-re@*:* {{{{(static_assert|static assertion)}} failed due to requirement 'is_same>::value'{{.*}}Allocator::value_type must be same type as value_type}} + s2 <=> s1; + } + // Mismatching comparision functions + { + std::map> s1; + std::map> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s1 <=> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s2 <=> s1; + } + { + std::map> s1; + std::map> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s1 <=> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s2 <=> s1; + } + // Mismatching types + { + std::map s1; + std::map s2; + // expected-error@+1 {{invalid operands to binary expression}} + s1 <=> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s2 <=> s1; + } + + return 0; +} diff --git a/libcxx/test/std/containers/associative/multimap/compare.three_way.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.nonmember/compare.three_way.pass.cpp similarity index 100% rename from libcxx/test/std/containers/associative/multimap/compare.three_way.pass.cpp rename to libcxx/test/std/containers/associative/multimap/multimap.nonmember/compare.three_way.pass.cpp diff --git a/libcxx/test/std/containers/associative/multimap/multimap.nonmember/compare.three_way.verify.cpp b/libcxx/test/std/containers/associative/multimap/multimap.nonmember/compare.three_way.verify.cpp new file mode 100644 index 0000000000000..a6b3a0c99cf08 --- /dev/null +++ b/libcxx/test/std/containers/associative/multimap/multimap.nonmember/compare.three_way.verify.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// class multimap + +// template +// synth-three-way-result> +// operator<=>(const multimap& x, +// const multimap& y); + +#include + +#include "test_allocator.h" + +int main(int, char**) { + // Mismatching allocators + { + std::multimap, std::allocator> s1; + std::multimap, test_allocator> s2; + // expected-error-re@*:* {{{{(static_assert|static assertion)}} failed due to requirement 'is_same>::value'{{.*}}Allocator::value_type must be same type as value_type}} + s1 <=> s2; + // expected-error-re@*:* {{{{(static_assert|static assertion)}} failed due to requirement 'is_same>::value'{{.*}}Allocator::value_type must be same type as value_type}} + s2 <=> s1; + } + // Mismatching comparision functions + { + std::multimap> s1; + std::multimap> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s1 <=> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s2 <=> s1; + } + { + std::multimap> s1; + std::multimap> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s1 <=> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s2 <=> s1; + } + // Mismatching types + { + std::multimap s1; + std::multimap s2; + // expected-error@+1 {{invalid operands to binary expression}} + s1 <=> s2; + // expected-error@+1 {{invalid operands to binary expression}} + s2 <=> s1; + } + + return 0; +} diff --git a/libcxx/test/support/test_container_comparisons.h b/libcxx/test/support/test_container_comparisons.h index d3b4033039c37..8748f2d8efdd3 100644 --- a/libcxx/test/support/test_container_comparisons.h +++ b/libcxx/test/support/test_container_comparisons.h @@ -10,6 +10,7 @@ #ifndef TEST_CONTAINER_COMPARISONS #define TEST_CONTAINER_COMPARISONS +#include #include #include "test_comparisons.h" @@ -84,103 +85,109 @@ constexpr bool test_sequence_container_spaceship() { } // Implementation detail of `test_ordered_map_container_spaceship` -template